Perl regex syntax generation
This is a follow up to the question posted here: Perl Regex syntax
The results from that discussion yielded this script:
#!/usr/bin/env perl
use strict;
use warnings;
my @lines = <DATA>;
my $current_label = '';
my @ordered_labels;
my %data;
for my $line (@lines) {
if ( $line =~ /^\/(.*)$/ ) { # starts with slash
$current_label = $1;
push @ordered_labels, $current_label;
next;
}
if ( length $current_label ) {
if ( $line =~ /^(\d) "(.*)"$/ ) {
$data{$current_label}{$1} = $2;
next;
}
}
}
for my $label ( @ordered_labels ) {
print "$label <- as.factor($label\n";
print " , levels= c(";
开发者_Python百科 print join(',',map { $_ } sort keys %{$data{$label}} );
print ")\n";
print " , labels= c(";
print join(',',
map { '"' . $data{$label}{$_} . '"' }
sort keys %{$data{$label}} );
print ")\n";
print " )\n";
}
__DATA__
...A bunch of nonsense I do not care about...
...
Value Labels
/gender
1 "M"
2 "F"
/purpose
1 "business"
2 "vacation"
3 "tiddlywinks"
execute .
Essentially, I need to build the Perl to accommodate a syntax shorthand found in the SPSS file. For adjacent columns, SPSS allows one to type something like:
VALUE LABELS
/agree1 to agree5
1 "Strongly disagree"
2 "Disagree"
3 "Neutral"
4 "Agree"
5 "Strongly agree"
As the script currently exists, it will generate this:
agree1 to agree5 <- factor(agree1 to agree5
, levels= c(1,2,3,4,5,6)
, labels= c("Strongly disagree","Disagree","Neutral","Agree","Strongly agree","N/A")
)
and I need it to produce something like this:
agree1 <- factor(agree1
, levels= c(1,2,3,4,5,6)
, labels= c("Strongly disagree","Disagree","Neutral","Agree","Strongly agree","N/A")
)
agree2 <- factor(agree2
, levels= c(1,2,3,4,5,6)
, labels= c("Strongly disagree","Disagree","Neutral","Agree","Strongly agree","N/A")
)
…
use strict;
use warnings;
main();
sub main {
my @lines = <DATA>;
my $vlabels = get_value_labels(@lines);
write_output_delim($vlabels);
}
# Extract the value label information from SPSS syntax.
sub get_value_labels {
my (@vlabels, $i, $j);
for my $line (@_){
if ( $line =~ /^\/(.+)/ ){
my @vars = parse_var_range($1);
$i = @vlabels;
$j = $i + @vars - 1;
push @vlabels, { var => $_, codes => [] } for @vars;
}
elsif ( $line =~ /^\s* (\d) \s+ "(.*)"$/x ){
push @{$vlabels[$_]{codes}}, [$1, $2] for $i .. $j;
}
}
return \@vlabels;
}
# A helper function to handle variable ranges: "agree1 to agree3".
sub parse_var_range {
my $vr = shift;
my @vars = split /\s+ to \s+/x, $vr;
return $vr unless @vars > 1;
my ($stem) = $vars[0] =~ /(.+?)\d+$/;
my @n = map { /(\d+)$/ } @vars;
return map { "$stem" . $_ } $n[0] .. $n[1];
}
sub write_output_delim {
my $vlabels = shift;
for my $vlab (@$vlabels){
print $vlab->{var}, "\n";
print join("\t", '', @$_), "\n" for @{$vlab->{codes}}
}
}
sub write_output_factors {
# You get the idea...
}
__DATA__
/gender
1 "M"
2 "F"
/purpose
1 "business"
2 "vacation"
3 "tiddlywinks"
/agree1 to agree3
1 "Disagree"
2 "Neutral"
3 "Agree"
精彩评论