How can I extract the nth occurrence of a match in a Perl regex?
Is it possible to extract the nth match in a string of single-quoted words?
use strict;
use warnings;
# Sample inputs: in both strings the third single-quoted item is 'Perl'.
# $string1 separates items with spaces and a comma, $string2 with commas only.
my $string1 = "'I want to' 'extract the word' 'Perl','from this string'";
my $string2 = "'What about','getting','Perl','from','here','?'";
# Skeleton from the question: the pattern below is a stand-in for the regex
# being asked about, not a working solution.
#
#   $string - text to search
#   $index  - position of the wanted match (interpolated into the pattern)
#
# Returns the first value produced by the list-context match, or undef when
# the pattern does not match.
sub extract_quoted {
    my $string = shift;
    my $index  = shift;

    # The match runs in list context; only its first result is kept.
    my @results = $string =~ /some_regex_using _$index/;
    return $results[0];
}
# Intended usage of the sub defined above: ask for the 3rd quoted item.
extract_quoted($string1, 3); # Should return 'Perl', with quotes
extract_quoted($string2, 3); # Should return 'Perl', with quotes
This ought to work:
# Return the contents of the $index-th single-quoted item in $string.
#
#   $string - text containing 'single-quoted' items
#   $index  - 1-based position of the wanted item
#
# The surrounding quotes are not part of the returned value. Returns undef
# when the string holds fewer than $index items, and nothing (undef in scalar
# context) when $index is missing or smaller than 1.
sub extract_quoted {
    my ($string, $index) = @_;

    # A non-positive position would become a negative subscript below and
    # silently count from the end of the match list.
    return unless defined $index && $index >= 1;

    # With /g in list context the match returns every capture in order.
    # The slice is 0-based while $index is 1-based, hence the "- 1".
    my $wanted = ($string =~ /'(.*?)'/g)[$index - 1];
    return $wanted;
}
See this question: How do I save matched parts of a regex in Perl? and this answer (the /g switch is the trick)
The match operator with the /g modifier, evaluated in list context, returns a list of all the matches. Therefore:
# A global (/g) match in list context returns every capture, in order.
my @matches = $string =~ /'(.*?)'/g;
# $index is 1-based while array subscripts are 0-based.
my $wanted = $matches[$index - 1];
is one way to get what you want.
You can try:
# Step through the single-quoted items of $string one match at a time and
# return the contents (without the quotes) of the $index-th one, counting
# from 1. Stops scanning as soon as the wanted item is reached.
sub extract_quoted {
    my ($string, $index) = @_;

    my $seen = 0;
    while ($string =~ /'(.*?)'/g) {
        $seen++;
        next unless $seen == $index;
        return $1;
    }

    # Ran out of matches: undef in scalar context, empty list in list context.
    return;
}
use strict; use warnings;
use Text::ParseWords;
# Sample inputs; the third quoted item of each one is Perl.
my $string1 = q{'I want to' 'extract the word' 'Perl','from this string'};
my $string2 = q{'What about', 'getting','Perl','from','here','?'};

# Print the third item of every sample, one per line.
foreach my $sample ($string1, $string2) {
    print extract_wanted($sample, 3), "\n";
}
# Return the $index-th item of $string, as split by Text::ParseWords.
#
#   $string - items separated by whitespace and/or commas; items may be quoted
#   $index  - 1-based position of the wanted item
#
# parse_line is called with keep = 0, so the returned item does not include
# its surrounding quotes. Returns nothing (undef in scalar context, empty list
# in list context) when $index is missing, smaller than 1, or past the last
# item.
sub extract_wanted {
    my ($string, $index) = @_;

    # A non-positive position would become a negative subscript below and
    # silently count from the end of the item list.
    return unless defined $index && $index >= 1;

    # Delimiter: any run of whitespace and/or commas. The group must be the
    # non-capturing (?: ... ); "(:?" would instead mean "an optional colon".
    my @items  = parse_line('(?:\s|,)+', 0, $string);
    my $wanted = $items[$index - 1];

    return unless defined $wanted;
    return $wanted;
}
Output:
Perl
Perl
It may look ugly, but
my $quoted = qr/'[^']+'/; # ' fix Stackoverflow highlighting

# Compiled patterns, keyed by the requested position, so each distinct
# $index builds its regex only once.
my %_extract_wanted_cache;

# Return the $index-th single-quoted item of $string, quotes included.
#
#   $string - text containing 'single-quoted' items
#   $index  - 1-based position of the wanted item (a positive integer)
#
# Returns nothing (undef in scalar context, empty list in list context) when
# $index is not a positive integer or the string has fewer than $index items.
sub extract_wanted_memo {
    my ($string, $index) = @_;

    # The position ends up inside a {n} quantifier, so only plain positive
    # integers are accepted.
    return
        unless defined $index
        && $index =~ /\A[0-9]+\z/
        && $index >= 1;

    my $skip  = $index - 1;
    my $regex = $_extract_wanted_cache{$index} ||=
        # Skip $skip quoted items, then capture the next one. The lazy .*?
        # sits in front of every item (including the captured one) so text
        # before the first quote is tolerated; /s lets it cross newlines.
        qr/^(?:.*?$quoted){$skip}.*?($quoted)/s;

    return $string =~ $regex ? $1 : ();
}
benchmarking suggests it might be worthwhile:
# Uncached variant: return the $index-th single-quoted item of $string,
# quotes included, rebuilding the pattern on every call.
#
#   $string - text containing 'single-quoted' items
#   $index  - 1-based position of the wanted item (a positive integer)
#
# Returns nothing (undef in scalar context, empty list in list context) when
# $index is not a positive integer or the string has fewer than $index items.
sub extract_wanted {
    my ($string, $index) = @_;

    # The position ends up inside a {n} quantifier, so only plain positive
    # integers are accepted.
    return
        unless defined $index
        && $index =~ /\A[0-9]+\z/
        && $index >= 1;

    # Skip $skip quoted items, then capture the next one. The lazy .*? sits
    # in front of every item (including the captured one) so text before the
    # first quote is tolerated; /s lets it cross newlines.
    my $skip = $index - 1;
    return $string =~ /^(?:.*?$quoted){$skip}.*?($quoted)/s ? $1 : ();
}
# Collect every match of $quoted in $string with a global match and pick the
# $index-th one (1-based) straight out of the resulting list.
sub extract_wanted_gindex {
    my ($string, $index) = @_;

    # List subscripts are 0-based.
    my $offset = $index - 1;
    return ($string =~ /$quoted/g)[$offset];
}
use Benchmark;
# Time the three implementations on a hit (3rd item) and on a miss (100th
# item). A count of -1 asks Benchmark to run each case for at least one CPU
# second.
timethese(-1, {
    nocache      => sub { extract_wanted($string2, 3) },
    memoize      => sub { extract_wanted_memo($string2, 3) },
    index        => sub { extract_wanted_gindex($string2, 3) },
    nocache_fail => sub { extract_wanted($string2, 100) },
    memoize_fail => sub { extract_wanted_memo($string2, 100) },
    index_fail   => sub { extract_wanted_gindex($string2, 100) },
});
Results:
Benchmark: running index, index_fail, memoize, memoize_fail, nocache, nocache_fail for at least 1 CPU seconds ...
       index: 1 w/c secs (1.04 usr + 0.00 sys = 1.04 CPU) @183794.23/s (n=191146)
  index_fail: 1 w/c secs (1.03 usr + 0.00 sys = 1.03 CPU) @185578.64/s (n=191146)
     memoize: 1 w/c secs (1.00 usr + 0.00 sys = 1.00 CPU) @264664.00/s (n=264664)
memoize_fail: 0 w/c secs (1.03 usr + 0.00 sys = 1.03 CPU) @835106.80/s (n=860160)
     nocache: 0 w/c secs (1.03 usr + 0.00 sys = 1.03 CPU) @196495.15/s (n=202390)
nocache_fail: 2 w/c secs (1.03 usr + 0.00 sys = 1.03 CPU) @445390.29/s (n=458752)
# Return the contents of the $index-th single-quoted item in $string by
# splitting the string on the quote character.
#
#   $string - text containing 'single-quoted' items
#   $index  - 1-based position of the wanted item
#
# The surrounding quotes are not part of the returned value. Returns nothing
# (undef in scalar context, empty list in list context) when the string holds
# fewer than $index items.
sub extract_quoted {
    my ($string, $index) = @_;

    # After splitting on "'", the odd-numbered fields are the text between an
    # opening and a closing quote. The -1 limit keeps trailing empty fields so
    # an empty item at the end of the string ('') is still counted.
    my @fields = split /'/, $string, -1;

    my $count = 0;
    for (my $i = 1; $i <= $#fields; $i += 2) {
        $count++;
        return $fields[$i] if $count == $index;
    }

    return;
}
精彩评论