--- a/perlgrok.pl Thu Sep 30 00:30:07 2021 +0200
+++ b/perlgrok.pl Thu Sep 30 15:30:10 2021 +0200
@@ -1,5 +1,5 @@
#!/usr/bin/perl
-# $Id: perlgrok.pl,v bca12968c007 2021/09/29 20:31:56 grin $
+# $Id: perlgrok.pl,v d6e64daafdc1 2021/09/29 23:46:59 grin $
#
# grok parser in perl
#
@@ -14,7 +14,7 @@
use Getopt::Long;
# pattern filename
-my $fn_pattern = "patterns";
+my @fn_patterns = ();
# top pattern we resolve
my $top = "TEST";
@@ -28,10 +28,13 @@
my %input;
my %processing;
my $help = 0;
+my $re_trace = 1; # check all regexp
+
+my %xlate; # perl names don't support period in name, so we need to translate
GetOptions(
"input=s" => \$fn_test,
- "pattern=s" => \$fn_pattern,
+ "pattern=s" => \@fn_patterns,
"top=s" => \$top,
"parsed=s" => \$fn_parsed,
"failed=s" => \$fn_fail,
@@ -40,29 +43,40 @@
) or die "error in command line arguments";
if( $help ) {
- my $VER = '$Id: perlgrok.pl,v bca12968c007 2021/09/29 20:31:56 grin $';
+ my $VER = '$Id: perlgrok.pl,v d6e64daafdc1 2021/09/29 23:46:59 grin $';
print "$0 $VER (c) Peter 'grin' Gervai, 2021\n\n";
- print "Usage: $0 [--input input_file] [--pattern pattern_file] [--top final_macro] \n";
+ print "Usage: $0 [--input input_file] [--pattern pattern_file] [--pattern another] [--top final_macro] \n";
print " [--parsed parsed_outfile] [--failed failed_lines_file] [--debug n] [--help]\n";
print " defaults are stdin, stdout, 'patterns' as patternfile and 'TEST' as final macro.\n\n";
exit;
}
+if( $#fn_patterns < 0 ) {
+ push @fn_patterns, "patterns";
+}
+
+
# read patterns
-my $f = new IO::File "< $fn_pattern" or die "cannot open patterns: $!";
-while( <$f> ) {
- chomp;
- # skip empty and comment lines
- next if /^ *(#|$)/;
-
- # parse <name><space><content>
- if( /^(\S+) +(.+)$/ ) {
- $input{$1} = $2;
- } else {
- die "Patternfile parse error:\n$_\n\n";
+foreach my $fn_pattern (@fn_patterns) {
+ my $f = new IO::File "< $fn_pattern" or die "cannot open pattern $fn_pattern: $!";
+ &d(7, "Reading $fn_pattern pattern file contents");
+ while( <$f> ) {
+ chomp;
+ # skip empty and comment lines
+ next if /^ *(#|$)/;
+
+ # parse <name><space><content>
+ if( /^(\S+) +(.+)$/ ) {
+ if( defined( $input{$1} ) ) {
+ &d(1, "Warning: duplicate macro $1 in $fn_pattern; overwrites previous version!");
+ }
+ $input{$1} = $2;
+ } else {
+ die "Patternfile $fn_pattern parse error:\n$_\n\n";
+ }
}
+ $f->close;
}
-$f->close;
&d(5, "Start program");
&resolve( $top, '<root>' );
@@ -71,8 +85,17 @@
&d(9, "RESULT\n========\n${top} = $res" );
&d(9, "Pattern length=" . length($res) );
+eval {
+ my $test = "abc123";
+ $test =~ /$res/;
+};
+if( $@ ) {
+ print "ERR: $@\n\n";
+ exit;
+}
+
### read file
-$f = new IO::File "< $fn_test" or die "cannot open $fn_test: $!";
+my $f = new IO::File "< $fn_test" or die "cannot open $fn_test: $!";
my $ff;
if( $fn_fail ) { $ff = new IO::File "> $fn_fail" or die "cannot open $fn_fail for writing: $!"; }
my $fout = new IO::File "> $fn_parsed" or die "cannot open output for parsed: $!";
@@ -84,7 +107,7 @@
# there is a match, print parsed values
print $fout "$_\n";
for my $k ( sort keys %+ ) {
- print $fout "\t$k => '$+{$k}'\n";
+ print $fout "\t".&unxlate($k)." => '$+{$k}'\n";
}
} else {
if( $fn_fail ) {
@@ -121,7 +144,7 @@
&d( 7, "Resolve_all_macros loop in $key");
# while we have unresolved macros, walk and resolve them all
- while( $data =~ /%\{(?<k>[A-Z_0-9]+)(:(?<v>[_\w\d]+))?\}/ ) {
+ while( $data =~ /%\{(?<k>[A-Z_0-9]+)(:(?<v>[_\w.\d]+))?\}/ ) {
my $r_key = $+{k};
my $r_name = $+{v} || ""; # this is just informational here
@@ -143,6 +166,11 @@
$input{$key} = $data;
&d(7, "RESOLVED '$key'! Result=/$data/");
+ if( $re_trace ) {
+ # die if regexp is bad
+ "abc" =~ /$data/;
+ }
+
# release loop protection for $key
delete $processing{$key};
}
@@ -154,15 +182,20 @@
&d(7, "Replace_all $key in /$$data/");
# while we have this macro in $data somewhere, repeat...
- while( $$data =~ /\{${key}(:(?<v>[_\w\d]+))?\}/ ) {
+ while( $$data =~ /\{${key}(:(?<v>[_\w.\d]+))?\}/ ) {
my $mname = $+{v};
my $inner = $input{$key}; # shall be already resolved, may be checked (FIXME)
&d(7, " Found '$key'");
+ my $xname = $mname;
if( $mname ) {
+ # if it's not perl compatible, create a translation entry
+ if( $mname !~ /^[_A-Za-z][_A-Za-z0-9]*\z/ ) {
+ $xname = &gen_xlate_entry($mname);
+ }
# named macro, create a named capture group
- $$data =~ s/%\{${key}:${mname}\}/(?<${mname}>${inner})/g;
+ $$data =~ s/%\{${key}:${mname}\}/(?<${xname}>${inner})/g;
&d(7, " Replace $key:$mname with named capture group /$inner/");
} else {
# unnamed macro, we just throw it in (trust no1 and parenthesize it)
@@ -170,12 +203,57 @@
&d(7, " Replace $key with /$inner/");
}
+ &sanitize_labels( $data );
+
&d(9, " Replacement result is /$$data/");
}
&d(7, "REPLACED ALL '$key' in /$$data/");
}
+sub gen_xlate_entry {
+ my ($mname) = @_;
+ &d(7, "Generate a translation variable name for $mname");
+ my $xname = "xlate_" . int(rand(1_000_000));
+ my $count = 100_000;
+ while( defined( $xlate{$xname} ) ) {
+ $xname = "xlate_" . int(rand(1_000_000));
+ die "Cannot generate unique translation for $mname in a _lot_ of steps" if --$count<=0;
+ }
+
+ &d(7, "Generated $xname as the name for $mname");
+ $xlate{$xname} = $mname;
+ return $xname;
+}
+
+
+# find ?<names> which perl incompatible, and xlate them forcibly
+sub sanitize_labels {
+ my ($data) = @_;
+
+ if( $$data =~ /(?<!\\)\((?!\\)\?<([_A-Za-z][_A-Za-z0-9]*[^>_A-Za-z0-9])/ ) {
+ &d(7, "Irregular name in $$data, create translation");
+ # we have at least one irregular, get all labels
+ while( $$data =~ /(?<!\\)\((?!\\)\?<(.+?)>/g ) {
+ my $label = $1;
+ &d(9, "Got label $label (***BEFORE $` ##<->## $' AFTER***)");
+ if( $label !~ /^[_A-Za-z][_A-Za-z0-9]*\z/ ) {
+ # not perl compatible, translate and replace
+ my $xname = &gen_xlate_entry($label);
+ $$data =~ s/(?<!\\)\((?!\\)\?<${label}>/(?<${xname}>/g;
+ &d(5, "ForceReplaced label '$label' with '$xname' in /$$data/");
+ }
+ }
+ }
+}
+
+
+sub unxlate {
+ my ($xkey) = @_;
+ return $xlate{$xkey};
+}
+
+
# extremely low-key debug subsystem :-)
sub d {
my ($lvl, $s) = @_;