perlgrok.pl
author Peter Gervai <grin@grin.hu>
Wed, 15 May 2024 09:04:25 +0200
changeset 13 9ec5ecfe4347
parent 11 5c38653bd7e7
permissions -rwxr-xr-x
pwgen: add comment about entropy calculation

#!/usr/bin/perl
# $Id: perlgrok.pl,v 43b8b0709ac5 2021/10/02 13:03:51 grin $
#
# grok parser in perl
#
# (c)Peter 'grin' Gervai, 2021; CC_BY-SA-4.0 / GPLv3+
# grin*grin.hu
#
# This has been thrown together within half an hour, so... beware of the rough edges.

use warnings;
use strict;
use IO::File;
use Getopt::Long;

# pattern filename
my @fn_patterns = ();

# top pattern we resolve
my $top = "TEST";

# debug level
my $DEBUG=1;

my $fn_test = "-"; # STDIN
my $fn_fail;
my $fn_parsed = "-"; # SDTOUT
my %input;
my %processing;
my $help = 0;
my $re_trace = 1; # check all regexp

my %xlate;  # perl names don't support period in name, so we need to translate

GetOptions( 
    "input=s"       => \$fn_test,
    "pattern=s"     => \@fn_patterns,
    "top=s"         => \$top,
    "parsed=s"      => \$fn_parsed,
    "failed=s"      => \$fn_fail,
    "debug=i"       => \$DEBUG,
    "help"          => \$help,
) or die "error in command line arguments";

if( $help ) {
    my $VER = '$Id: perlgrok.pl,v 43b8b0709ac5 2021/10/02 13:03:51 grin $';
    print "$0 $VER (c) Peter 'grin' Gervai, 2021\n\n";
    print "Usage: $0 [--input input_file] [--pattern pattern_file] [--pattern another] [--top final_macro] \n";
    print "          [--parsed parsed_outfile] [--failed failed_lines_file] [--debug n] [--help]\n";
    print "       defaults are stdin, stdout, 'patterns' as patternfile and 'TEST' as final macro.\n\n";
    exit;
}

if( $#fn_patterns < 0 ) {
    push @fn_patterns, "patterns";
}


# read patterns
foreach my $fn_pattern (@fn_patterns) {
    my $f = new IO::File "< $fn_pattern" or die "cannot open pattern $fn_pattern: $!";
    &d(7, "Reading $fn_pattern pattern file contents");
    while( <$f> ) {
        chomp;
        # skip empty and comment lines
        next if /^ *(#|$)/;
        
        # parse <name><space><content>
        if( /^(\S+) +(.+)$/ ) {
            if( defined( $input{$1} ) ) {
                &d(1, "Warning: duplicate macro $1 in $fn_pattern; overwrites previous version!");
            }
            $input{$1} = $2;
        } else {
            die "Patternfile $fn_pattern parse error:\n$_\n\n";
        }
    }
    $f->close;
}

&d(5, "Start program");
&resolve( $top, '<root>' );

my $res = $input{$top};
&d(9, "RESULT\n========\n${top} = $res" );
&d(9, "Pattern length=" . length($res) );

## test final pattern
eval {
    my $test = qr($res);
};
if( $@ ) {
    print "ERR: $@\n\n";
    exit;
}

### read file
my $f = new IO::File "< $fn_test" or die "cannot open $fn_test: $!";
my $ff;
if( $fn_fail ) { $ff = new IO::File "> $fn_fail" or die "cannot open $fn_fail for writing: $!"; }
my $fout = new IO::File "> $fn_parsed" or die "cannot open output for parsed: $!";

while( <$f> ) {
    chomp;
    &d(5, "LINE:$_");
    if( /$res/ ) {
        # there is a match, print parsed values
        print $fout "$_\n";
        for my $k ( sort keys %+ ) {
            print $fout "\t".&unxlate($k)." => '$+{$k}'\n";
        }
    } else {
        if( $fn_fail ) {
            print $ff "$_\n"  if $fn_fail ;
        } else {
            print "$_\n";
            print "\t-no match-\n";
        }
    }
}

$fout->close;
$f->close;
$ff->close if $fn_fail;

exit;


#########################################################################################################

# fully resolve a macro line (recursively)
sub resolve {
    my ($key, $caller_key) = @_;
    
    &d( 7, "Called resolve for $key from $caller_key");
    
    # get macro data from %input
    my $data = $input{$key};
    if( $processing{$key} ) {
        # already processing this key: looping
        die "Resolving loop in $key: already processing for $processing{$key}! Called from $caller_key.";
    }
    $processing{$key} = $caller_key;
    
    &d( 7, "Resolve_all_macros loop in $key");
    # while we have unresolved macros, walk and resolve them all
    while( $data =~ /%\{(?<k>[A-Z_0-9]+)(:(?<v>[_\w.\d]+))?\}/ ) {
        my $r_key = $+{k};
        my $r_name = $+{v} || ""; # this is just informational here
        
        &d(7, "Resolve macro in $key for '$r_key' (name $r_name) in data /$data/" );
        
        # do we know the macro?
        if( defined( $input{ $r_key } ) ) {
            # resolve everything in the macro
            &resolve( $r_key, $key );
            # replace fully resolved macro in $data
            &replace_all( \$data, $r_key );
            
        } else {
            die "Unknown macro '$r_key'! Called from $caller_key.";
        }
    }
    
    # sanitize macros where no resolve was necessary but manual entries may still screw perl up
    &sanitize_labels( \$data );
    
    # update macro in %input with fully resolved line
    $input{$key} = $data;
    &d(7, "RESOLVED '$key'! Result=/$data/");

    if( $re_trace ) {
        # die if regexp is bad
        my $test = qr($data);
    }

    # release loop protection for $key
    delete $processing{$key};
}


## replace all macros with replacement [macro-free] regexp
sub replace_all {
    my ($data, $key) = @_;
    
    &d(7, "Replace_all $key in /$$data/");
    # while we have this macro in $data somewhere, repeat...
    while( $$data =~ /\{${key}(:(?<v>[_\w.\d]+))?\}/ ) {
        my $mname = $+{v};
        my $inner = $input{$key};	# shall be already resolved, may be checked (FIXME)
        
        &d(7, " Found '$key'");
        
        my $xname = $mname;
        if( $mname ) {
            # if it's not perl compatible, create a translation entry
            if( $mname !~ /^[_A-Za-z][_A-Za-z0-9]*\z/ ) {
                $xname = &gen_xlate_entry($mname);
            }
            # named macro, create a named capture group
            $$data =~ s/%\{${key}:${mname}\}/(?<${xname}>${inner})/g;
            &d(7, "  Replace $key:$mname with named capture group /$inner/");
        } else {
            # unnamed macro, we just throw it in (trust no1 and parenthesize it)
            $$data =~  s/%\{${key}\}/(?:${inner})/g;
            &d(7, "  Replace $key with /$inner/");
        }
        
        &sanitize_labels( $data );

        &d(9, "   Replacement result is /$$data/");
    }
    &d(7, "REPLACED ALL '$key' in /$$data/");
}


sub gen_xlate_entry {
    my ($mname) = @_;
    &d(7, "Generate a translation variable name for $mname");
    my $xname = "xlate_" . int(rand(1_000_000));
    my $count = 100_000;
    while( defined( $xlate{$xname} ) ) {
        $xname = "xlate_" . int(rand(1_000_000));
        die "Cannot generate unique translation for $mname in a _lot_ of steps" if --$count<=0;
    }
    
    &d(7, "Generated $xname as the name for $mname");
    $xlate{$xname} = $mname;
    return $xname;
}


# find ?<names> which perl incompatible, and xlate them forcibly
sub sanitize_labels {
    my ($data) = @_;

    if( $$data =~ /(?<!\\)\((?!\\)\?<([_A-Za-z][_A-Za-z0-9]*[^>_A-Za-z0-9])/ ) {
        &d(7, "Irregular name in $$data, create translation");
        # we have at least one irregular, get all labels
        while( $$data =~ /(?<!\\)\((?!\\)\?<(.+?)>/g ) {
            my $label = $1;
            &d(9, "Got label $label (***BEFORE $` ##<->## $' AFTER***)");
            if( $label !~ /^[_A-Za-z][_A-Za-z0-9]*\z/ ) {
                # not perl compatible, translate and replace
                my $xname = &gen_xlate_entry($label);
                $$data =~ s/(?<!\\)\((?!\\)\?<${label}>/(?<${xname}>/g;
                &d(5, "ForceReplaced label '$label' with '$xname' in /$$data/");
            }
        }
    }
}


sub unxlate {
    my ($xkey) = @_;
    return $xlate{$xkey};
}


# extremely low-key debug subsystem :-)
sub d {
    my ($lvl, $s) = @_;
    return if $lvl > $DEBUG;
    print scalar(localtime) . " [$$] ($lvl) $s\n";
}