diff -r c4085453688b -r 31c4ce4d9b73 perlgrok.pl --- a/perlgrok.pl Thu Sep 30 00:30:07 2021 +0200 +++ b/perlgrok.pl Thu Sep 30 15:30:10 2021 +0200 @@ -1,5 +1,5 @@ #!/usr/bin/perl -# $Id: perlgrok.pl,v bca12968c007 2021/09/29 20:31:56 grin $ +# $Id: perlgrok.pl,v d6e64daafdc1 2021/09/29 23:46:59 grin $ # # grok parser in perl # @@ -14,7 +14,7 @@ use Getopt::Long; # pattern filename -my $fn_pattern = "patterns"; +my @fn_patterns = (); # top pattern we resolve my $top = "TEST"; @@ -28,10 +28,13 @@ my %input; my %processing; my $help = 0; +my $re_trace = 1; # check all regexp + +my %xlate; # perl names don't support period in name, so we need to translate GetOptions( "input=s" => \$fn_test, - "pattern=s" => \$fn_pattern, + "pattern=s" => \@fn_patterns, "top=s" => \$top, "parsed=s" => \$fn_parsed, "failed=s" => \$fn_fail, @@ -40,29 +43,40 @@ ) or die "error in command line arguments"; if( $help ) { - my $VER = '$Id: perlgrok.pl,v bca12968c007 2021/09/29 20:31:56 grin $'; + my $VER = '$Id: perlgrok.pl,v d6e64daafdc1 2021/09/29 23:46:59 grin $'; print "$0 $VER (c) Peter 'grin' Gervai, 2021\n\n"; - print "Usage: $0 [--input input_file] [--pattern pattern_file] [--top final_macro] \n"; + print "Usage: $0 [--input input_file] [--pattern pattern_file] [--pattern another] [--top final_macro] \n"; print " [--parsed parsed_outfile] [--failed failed_lines_file] [--debug n] [--help]\n"; print " defaults are stdin, stdout, 'patterns' as patternfile and 'TEST' as final macro.\n\n"; exit; } +if( $#fn_patterns < 0 ) { + push @fn_patterns, "patterns"; +} + + # read patterns -my $f = new IO::File "< $fn_pattern" or die "cannot open patterns: $!"; -while( <$f> ) { - chomp; - # skip empty and comment lines - next if /^ *(#|$)/; - - # parse - if( /^(\S+) +(.+)$/ ) { - $input{$1} = $2; - } else { - die "Patternfile parse error:\n$_\n\n"; +foreach my $fn_pattern (@fn_patterns) { + my $f = new IO::File "< $fn_pattern" or die "cannot open pattern $fn_pattern: $!"; + &d(7, "Reading $fn_pattern pattern file contents"); + while( <$f> ) { + chomp; + # skip empty and comment lines + next if /^ *(#|$)/; + + # parse + if( /^(\S+) +(.+)$/ ) { + if( defined( $input{$1} ) ) { + &d(1, "Warning: duplicate macro $1 in $fn_pattern; overwrites previous version!"); + } + $input{$1} = $2; + } else { + die "Patternfile $fn_pattern parse error:\n$_\n\n"; + } } + $f->close; } -$f->close; &d(5, "Start program"); &resolve( $top, '' ); @@ -71,8 +85,17 @@ &d(9, "RESULT\n========\n${top} = $res" ); &d(9, "Pattern length=" . length($res) ); +eval { + my $test = "abc123"; + $test =~ /$res/; +}; +if( $@ ) { + print "ERR: $@\n\n"; + exit; +} + ### read file -$f = new IO::File "< $fn_test" or die "cannot open $fn_test: $!"; +my $f = new IO::File "< $fn_test" or die "cannot open $fn_test: $!"; my $ff; if( $fn_fail ) { $ff = new IO::File "> $fn_fail" or die "cannot open $fn_fail for writing: $!"; } my $fout = new IO::File "> $fn_parsed" or die "cannot open output for parsed: $!"; @@ -84,7 +107,7 @@ # there is a match, print parsed values print $fout "$_\n"; for my $k ( sort keys %+ ) { - print $fout "\t$k => '$+{$k}'\n"; + print $fout "\t".&unxlate($k)." => '$+{$k}'\n"; } } else { if( $fn_fail ) { @@ -121,7 +144,7 @@ &d( 7, "Resolve_all_macros loop in $key"); # while we have unresolved macros, walk and resolve them all - while( $data =~ /%\{(?[A-Z_0-9]+)(:(?[_\w\d]+))?\}/ ) { + while( $data =~ /%\{(?[A-Z_0-9]+)(:(?[_\w.\d]+))?\}/ ) { my $r_key = $+{k}; my $r_name = $+{v} || ""; # this is just informational here @@ -143,6 +166,11 @@ $input{$key} = $data; &d(7, "RESOLVED '$key'! Result=/$data/"); + if( $re_trace ) { + # die if regexp is bad + "abc" =~ /$data/; + } + # release loop protection for $key delete $processing{$key}; } @@ -154,15 +182,20 @@ &d(7, "Replace_all $key in /$$data/"); # while we have this macro in $data somewhere, repeat... - while( $$data =~ /\{${key}(:(?[_\w\d]+))?\}/ ) { + while( $$data =~ /\{${key}(:(?[_\w.\d]+))?\}/ ) { my $mname = $+{v}; my $inner = $input{$key}; # shall be already resolved, may be checked (FIXME) &d(7, " Found '$key'"); + my $xname = $mname; if( $mname ) { + # if it's not perl compatible, create a translation entry + if( $mname !~ /^[_A-Za-z][_A-Za-z0-9]*\z/ ) { + $xname = &gen_xlate_entry($mname); + } # named macro, create a named capture group - $$data =~ s/%\{${key}:${mname}\}/(?<${mname}>${inner})/g; + $$data =~ s/%\{${key}:${mname}\}/(?<${xname}>${inner})/g; &d(7, " Replace $key:$mname with named capture group /$inner/"); } else { # unnamed macro, we just throw it in (trust no1 and parenthesize it) @@ -170,12 +203,57 @@ &d(7, " Replace $key with /$inner/"); } + &sanitize_labels( $data ); + &d(9, " Replacement result is /$$data/"); } &d(7, "REPLACED ALL '$key' in /$$data/"); } +sub gen_xlate_entry { + my ($mname) = @_; + &d(7, "Generate a translation variable name for $mname"); + my $xname = "xlate_" . int(rand(1_000_000)); + my $count = 100_000; + while( defined( $xlate{$xname} ) ) { + $xname = "xlate_" . int(rand(1_000_000)); + die "Cannot generate unique translation for $mname in a _lot_ of steps" if --$count<=0; + } + + &d(7, "Generated $xname as the name for $mname"); + $xlate{$xname} = $mname; + return $xname; +} + + +# find ? which perl incompatible, and xlate them forcibly +sub sanitize_labels { + my ($data) = @_; + + if( $$data =~ /(?_A-Za-z0-9])/ ) { + &d(7, "Irregular name in $$data, create translation"); + # we have at least one irregular, get all labels + while( $$data =~ /(?/g ) { + my $label = $1; + &d(9, "Got label $label (***BEFORE $` ##<->## $' AFTER***)"); + if( $label !~ /^[_A-Za-z][_A-Za-z0-9]*\z/ ) { + # not perl compatible, translate and replace + my $xname = &gen_xlate_entry($label); + $$data =~ s/(?/(?<${xname}>/g; + &d(5, "ForceReplaced label '$label' with '$xname' in /$$data/"); + } + } + } +} + + +sub unxlate { + my ($xkey) = @_; + return $xlate{$xkey}; +} + + # extremely low-key debug subsystem :-) sub d { my ($lvl, $s) = @_;