perlgrok.pl: sanitize labels even if there was no macro replacement (manual capture groups)
#!/usr/bin/perl
# $Id: perlgrok.pl,v 43b8b0709ac5 2021/10/02 13:03:51 grin $
#
# grok parser in perl
#
# (c)Peter 'grin' Gervai, 2021; CC_BY-SA-4.0 / GPLv3+
# grin*grin.hu
#
# This has been thrown together within half an hour, so... beware of the rough edges.
use warnings;
use strict;
use IO::File;
use Getopt::Long;
# pattern filename
my @fn_patterns = ();
# top pattern we resolve
my $top = "TEST";
# debug level
my $DEBUG=1;
my $fn_test = "-"; # STDIN
my $fn_fail;
my $fn_parsed = "-"; # SDTOUT
my %input;
my %processing;
my $help = 0;
my $re_trace = 1; # check all regexp
my %xlate; # perl names don't support period in name, so we need to translate
GetOptions(
"input=s" => \$fn_test,
"pattern=s" => \@fn_patterns,
"top=s" => \$top,
"parsed=s" => \$fn_parsed,
"failed=s" => \$fn_fail,
"debug=i" => \$DEBUG,
"help" => \$help,
) or die "error in command line arguments";
if( $help ) {
my $VER = '$Id: perlgrok.pl,v 43b8b0709ac5 2021/10/02 13:03:51 grin $';
print "$0 $VER (c) Peter 'grin' Gervai, 2021\n\n";
print "Usage: $0 [--input input_file] [--pattern pattern_file] [--pattern another] [--top final_macro] \n";
print " [--parsed parsed_outfile] [--failed failed_lines_file] [--debug n] [--help]\n";
print " defaults are stdin, stdout, 'patterns' as patternfile and 'TEST' as final macro.\n\n";
exit;
}
if( $#fn_patterns < 0 ) {
push @fn_patterns, "patterns";
}
# read patterns
foreach my $fn_pattern (@fn_patterns) {
my $f = new IO::File "< $fn_pattern" or die "cannot open pattern $fn_pattern: $!";
&d(7, "Reading $fn_pattern pattern file contents");
while( <$f> ) {
chomp;
# skip empty and comment lines
next if /^ *(#|$)/;
# parse <name><space><content>
if( /^(\S+) +(.+)$/ ) {
if( defined( $input{$1} ) ) {
&d(1, "Warning: duplicate macro $1 in $fn_pattern; overwrites previous version!");
}
$input{$1} = $2;
} else {
die "Patternfile $fn_pattern parse error:\n$_\n\n";
}
}
$f->close;
}
&d(5, "Start program");
&resolve( $top, '<root>' );
my $res = $input{$top};
&d(9, "RESULT\n========\n${top} = $res" );
&d(9, "Pattern length=" . length($res) );
## test final pattern
eval {
my $test = qr($res);
};
if( $@ ) {
print "ERR: $@\n\n";
exit;
}
### read file
my $f = new IO::File "< $fn_test" or die "cannot open $fn_test: $!";
my $ff;
if( $fn_fail ) { $ff = new IO::File "> $fn_fail" or die "cannot open $fn_fail for writing: $!"; }
my $fout = new IO::File "> $fn_parsed" or die "cannot open output for parsed: $!";
while( <$f> ) {
chomp;
&d(5, "LINE:$_");
if( /$res/ ) {
# there is a match, print parsed values
print $fout "$_\n";
for my $k ( sort keys %+ ) {
print $fout "\t".&unxlate($k)." => '$+{$k}'\n";
}
} else {
if( $fn_fail ) {
print $ff "$_\n" if $fn_fail ;
} else {
print "$_\n";
print "\t-no match-\n";
}
}
}
$fout->close;
$f->close;
$ff->close if $fn_fail;
exit;
#########################################################################################################
# fully resolve a macro line (recursively)
sub resolve {
my ($key, $caller_key) = @_;
&d( 7, "Called resolve for $key from $caller_key");
# get macro data from %input
my $data = $input{$key};
if( $processing{$key} ) {
# already processing this key: looping
die "Resolving loop in $key: already processing for $processing{$key}! Called from $caller_key.";
}
$processing{$key} = $caller_key;
&d( 7, "Resolve_all_macros loop in $key");
# while we have unresolved macros, walk and resolve them all
while( $data =~ /%\{(?<k>[A-Z_0-9]+)(:(?<v>[_\w.\d]+))?\}/ ) {
my $r_key = $+{k};
my $r_name = $+{v} || ""; # this is just informational here
&d(7, "Resolve macro in $key for '$r_key' (name $r_name) in data /$data/" );
# do we know the macro?
if( defined( $input{ $r_key } ) ) {
# resolve everything in the macro
&resolve( $r_key, $key );
# replace fully resolved macro in $data
&replace_all( \$data, $r_key );
} else {
die "Unknown macro '$r_key'! Called from $caller_key.";
}
}
# sanitize macros where no resolve was necessary but manual entries may still screw perl up
&sanitize_labels( \$data );
# update macro in %input with fully resolved line
$input{$key} = $data;
&d(7, "RESOLVED '$key'! Result=/$data/");
if( $re_trace ) {
# die if regexp is bad
my $test = qr($data);
}
# release loop protection for $key
delete $processing{$key};
}
## replace all macros with replacement [macro-free] regexp
sub replace_all {
my ($data, $key) = @_;
&d(7, "Replace_all $key in /$$data/");
# while we have this macro in $data somewhere, repeat...
while( $$data =~ /\{${key}(:(?<v>[_\w.\d]+))?\}/ ) {
my $mname = $+{v};
my $inner = $input{$key}; # shall be already resolved, may be checked (FIXME)
&d(7, " Found '$key'");
my $xname = $mname;
if( $mname ) {
# if it's not perl compatible, create a translation entry
if( $mname !~ /^[_A-Za-z][_A-Za-z0-9]*\z/ ) {
$xname = &gen_xlate_entry($mname);
}
# named macro, create a named capture group
$$data =~ s/%\{${key}:${mname}\}/(?<${xname}>${inner})/g;
&d(7, " Replace $key:$mname with named capture group /$inner/");
} else {
# unnamed macro, we just throw it in (trust no1 and parenthesize it)
$$data =~ s/%\{${key}\}/(?:${inner})/g;
&d(7, " Replace $key with /$inner/");
}
&sanitize_labels( $data );
&d(9, " Replacement result is /$$data/");
}
&d(7, "REPLACED ALL '$key' in /$$data/");
}
sub gen_xlate_entry {
my ($mname) = @_;
&d(7, "Generate a translation variable name for $mname");
my $xname = "xlate_" . int(rand(1_000_000));
my $count = 100_000;
while( defined( $xlate{$xname} ) ) {
$xname = "xlate_" . int(rand(1_000_000));
die "Cannot generate unique translation for $mname in a _lot_ of steps" if --$count<=0;
}
&d(7, "Generated $xname as the name for $mname");
$xlate{$xname} = $mname;
return $xname;
}
# find ?<names> which perl incompatible, and xlate them forcibly
sub sanitize_labels {
my ($data) = @_;
if( $$data =~ /(?<!\\)\((?!\\)\?<([_A-Za-z][_A-Za-z0-9]*[^>_A-Za-z0-9])/ ) {
&d(7, "Irregular name in $$data, create translation");
# we have at least one irregular, get all labels
while( $$data =~ /(?<!\\)\((?!\\)\?<(.+?)>/g ) {
my $label = $1;
&d(9, "Got label $label (***BEFORE $` ##<->## $' AFTER***)");
if( $label !~ /^[_A-Za-z][_A-Za-z0-9]*\z/ ) {
# not perl compatible, translate and replace
my $xname = &gen_xlate_entry($label);
$$data =~ s/(?<!\\)\((?!\\)\?<${label}>/(?<${xname}>/g;
&d(5, "ForceReplaced label '$label' with '$xname' in /$$data/");
}
}
}
}
sub unxlate {
my ($xkey) = @_;
return $xlate{$xkey};
}
# extremely low-key debug subsystem :-)
sub d {
my ($lvl, $s) = @_;
return if $lvl > $DEBUG;
print scalar(localtime) . " [$$] ($lvl) $s\n";
}