perlgrok.pl
changeset 8 31c4ce4d9b73
parent 7 c4085453688b
child 9 f1cb959b4603
--- a/perlgrok.pl	Thu Sep 30 00:30:07 2021 +0200
+++ b/perlgrok.pl	Thu Sep 30 15:30:10 2021 +0200
@@ -1,5 +1,5 @@
 #!/usr/bin/perl
-# $Id: perlgrok.pl,v bca12968c007 2021/09/29 20:31:56 grin $
+# $Id: perlgrok.pl,v d6e64daafdc1 2021/09/29 23:46:59 grin $
 #
 # grok parser in perl
 #
@@ -14,7 +14,7 @@
 use Getopt::Long;
 
 # pattern filename
-my $fn_pattern = "patterns";
+my @fn_patterns = ();
 
 # top pattern we resolve
 my $top = "TEST";
@@ -28,10 +28,13 @@
 my %input;
 my %processing;
 my $help = 0;
+my $re_trace = 1; # check all regexp
+
+my %xlate;  # perl names don't support period in name, so we need to translate
 
 GetOptions( 
     "input=s"       => \$fn_test,
-    "pattern=s"     => \$fn_pattern,
+    "pattern=s"     => \@fn_patterns,
     "top=s"         => \$top,
     "parsed=s"      => \$fn_parsed,
     "failed=s"      => \$fn_fail,
@@ -40,29 +43,40 @@
 ) or die "error in command line arguments";
 
 if( $help ) {
-    my $VER = '$Id: perlgrok.pl,v bca12968c007 2021/09/29 20:31:56 grin $';
+    my $VER = '$Id: perlgrok.pl,v d6e64daafdc1 2021/09/29 23:46:59 grin $';
     print "$0 $VER (c) Peter 'grin' Gervai, 2021\n\n";
-    print "Usage: $0 [--input input_file] [--pattern pattern_file] [--top final_macro] \n";
+    print "Usage: $0 [--input input_file] [--pattern pattern_file] [--pattern another] [--top final_macro] \n";
     print "          [--parsed parsed_outfile] [--failed failed_lines_file] [--debug n] [--help]\n";
     print "       defaults are stdin, stdout, 'patterns' as patternfile and 'TEST' as final macro.\n\n";
     exit;
 }
 
+if( $#fn_patterns < 0 ) {
+    push @fn_patterns, "patterns";
+}
+
+
 # read patterns
-my $f = new IO::File "< $fn_pattern" or die "cannot open patterns: $!";
-while( <$f> ) {
-    chomp;
-    # skip empty and comment lines
-    next if /^ *(#|$)/;
-    
-    # parse <name><space><content>
-    if( /^(\S+) +(.+)$/ ) {
-        $input{$1} = $2;
-    } else {
-        die "Patternfile parse error:\n$_\n\n";
+foreach my $fn_pattern (@fn_patterns) {
+    my $f = new IO::File "< $fn_pattern" or die "cannot open pattern $fn_pattern: $!";
+    &d(7, "Reading $fn_pattern pattern file contents");
+    while( <$f> ) {
+        chomp;
+        # skip empty and comment lines
+        next if /^ *(#|$)/;
+        
+        # parse <name><space><content>
+        if( /^(\S+) +(.+)$/ ) {
+            if( defined( $input{$1} ) ) {
+                &d(1, "Warning: duplicate macro $1 in $fn_pattern; overwrites previous version!");
+            }
+            $input{$1} = $2;
+        } else {
+            die "Patternfile $fn_pattern parse error:\n$_\n\n";
+        }
     }
+    $f->close;
 }
-$f->close;
 
 &d(5, "Start program");
 &resolve( $top, '<root>' );
@@ -71,8 +85,17 @@
 &d(9, "RESULT\n========\n${top} = $res" );
 &d(9, "Pattern length=" . length($res) );
 
+eval {
+    my $test = "abc123";
+    $test =~ /$res/;
+};
+if( $@ ) {
+    print "ERR: $@\n\n";
+    exit;
+}
+
 ### read file
-$f = new IO::File "< $fn_test" or die "cannot open $fn_test: $!";
+my $f = new IO::File "< $fn_test" or die "cannot open $fn_test: $!";
 my $ff;
 if( $fn_fail ) { $ff = new IO::File "> $fn_fail" or die "cannot open $fn_fail for writing: $!"; }
 my $fout = new IO::File "> $fn_parsed" or die "cannot open output for parsed: $!";
@@ -84,7 +107,7 @@
         # there is a match, print parsed values
         print $fout "$_\n";
         for my $k ( sort keys %+ ) {
-            print $fout "\t$k => '$+{$k}'\n";
+            print $fout "\t".&unxlate($k)." => '$+{$k}'\n";
         }
     } else {
         if( $fn_fail ) {
@@ -121,7 +144,7 @@
     
     &d( 7, "Resolve_all_macros loop in $key");
     # while we have unresolved macros, walk and resolve them all
-    while( $data =~ /%\{(?<k>[A-Z_0-9]+)(:(?<v>[_\w\d]+))?\}/ ) {
+    while( $data =~ /%\{(?<k>[A-Z_0-9]+)(:(?<v>[_\w.\d]+))?\}/ ) {
         my $r_key = $+{k};
         my $r_name = $+{v} || ""; # this is just informational here
         
@@ -143,6 +166,11 @@
     $input{$key} = $data;
     &d(7, "RESOLVED '$key'! Result=/$data/");
 
+    if( $re_trace ) {
+        # die if regexp is bad
+        "abc" =~ /$data/;
+    }
+
     # release loop protection for $key
     delete $processing{$key};
 }
@@ -154,15 +182,20 @@
     
     &d(7, "Replace_all $key in /$$data/");
     # while we have this macro in $data somewhere, repeat...
-    while( $$data =~ /\{${key}(:(?<v>[_\w\d]+))?\}/ ) {
+    while( $$data =~ /\{${key}(:(?<v>[_\w.\d]+))?\}/ ) {
         my $mname = $+{v};
         my $inner = $input{$key};	# shall be already resolved, may be checked (FIXME)
         
         &d(7, " Found '$key'");
         
+        my $xname = $mname;
         if( $mname ) {
+            # if it's not perl compatible, create a translation entry
+            if( $mname !~ /^[_A-Za-z][_A-Za-z0-9]*\z/ ) {
+                $xname = &gen_xlate_entry($mname);
+            }
             # named macro, create a named capture group
-            $$data =~ s/%\{${key}:${mname}\}/(?<${mname}>${inner})/g;
+            $$data =~ s/%\{${key}:${mname}\}/(?<${xname}>${inner})/g;
             &d(7, "  Replace $key:$mname with named capture group /$inner/");
         } else {
             # unnamed macro, we just throw it in (trust no1 and parenthesize it)
@@ -170,12 +203,57 @@
             &d(7, "  Replace $key with /$inner/");
         }
         
+        &sanitize_labels( $data );
+
         &d(9, "   Replacement result is /$$data/");
     }
     &d(7, "REPLACED ALL '$key' in /$$data/");
 }
 
 
+sub gen_xlate_entry {
+    my ($mname) = @_;
+    &d(7, "Generate a translation variable name for $mname");
+    my $xname = "xlate_" . int(rand(1_000_000));
+    my $count = 100_000;
+    while( defined( $xlate{$xname} ) ) {
+        $xname = "xlate_" . int(rand(1_000_000));
+        die "Cannot generate unique translation for $mname in a _lot_ of steps" if --$count<=0;
+    }
+    
+    &d(7, "Generated $xname as the name for $mname");
+    $xlate{$xname} = $mname;
+    return $xname;
+}
+
+
+# find ?<names> which perl incompatible, and xlate them forcibly
+sub sanitize_labels {
+    my ($data) = @_;
+
+    if( $$data =~ /(?<!\\)\((?!\\)\?<([_A-Za-z][_A-Za-z0-9]*[^>_A-Za-z0-9])/ ) {
+        &d(7, "Irregular name in $$data, create translation");
+        # we have at least one irregular, get all labels
+        while( $$data =~ /(?<!\\)\((?!\\)\?<(.+?)>/g ) {
+            my $label = $1;
+            &d(9, "Got label $label (***BEFORE $` ##<->## $' AFTER***)");
+            if( $label !~ /^[_A-Za-z][_A-Za-z0-9]*\z/ ) {
+                # not perl compatible, translate and replace
+                my $xname = &gen_xlate_entry($label);
+                $$data =~ s/(?<!\\)\((?!\\)\?<${label}>/(?<${xname}>/g;
+                &d(5, "ForceReplaced label '$label' with '$xname' in /$$data/");
+            }
+        }
+    }
+}
+
+
+sub unxlate {
+    my ($xkey) = @_;
+    return $xlate{$xkey};
+}
+
+
 # extremely low-key debug subsystem :-)
 sub d {
     my ($lvl, $s) = @_;