delinquent_files/matcher.inc

Sat, 17 Feb 2024 15:36:09 +0100

author
Peter Gervai <grin@grin.hu>
date
Sat, 17 Feb 2024 15:36:09 +0100
changeset 6
8b4e205ffc94
parent 0
3b714bbb1347
permissions
-rw-r--r--

demon.php: ease some db pressure by not querying old entries (7 days).
- replaced some rejects workaround by exclusions
- some moar debug msgs

<?php

## This file contains the regex replacement functions.
## It is separated to be testable, by test_regex.php. 
## This has been written by Peter 'grin' Gervai about 2022.
## Licensed under GPLv3+ and CC_By_Sa-4.0. 
##
## $Id$
##

class Matcher {
    # debug framework object; this class only calls its msg() and trace() methods
    private $d;

    ## Keep the debug framework object and announce ourselves on it.
    ##   $debug : object offering msg($string) and trace($string)
    function __construct($debug) {
        $this->d = $debug;
        $this->d->msg("Matcher debug initialized");
    }

    ## Guard for preg_* results: those functions return NULL when the regex
    ## engine fails (invalid UTF-8 under /u, backtrack limit, ...). Passing that
    ## NULL along would turn the whole text into an empty string, so report the
    ## failure and keep the text as it was before the failed step.
    ##   $result   : return value of preg_replace / preg_replace_callback
    ##   $fallback : the text that was fed into that call
    ##   $what     : short step name for the debug message
    ## Returns $result, or $fallback when $result is NULL.
    private function matcher_keep_on_error( $result, $fallback, $what ) {
        if ( $result === null ) {
            $this->d->msg("Matcher: regex failed at $what (preg error " . preg_last_error() . "), text left unchanged");
            return $fallback;
        }
        return $result;
    }

    ## Prepare a filename regex pattern from a filename.
    ##   $file : file name, UTF-8, without namespace
    ## Returns a regex fragment (quoted for the '/' delimiter) in which
    ##  - the first letter matches in either case, as [Aa]
    ##  - every space or underscore after the first letter matches both, as [_ ]
    function matcher_prepare_pattern( $file ) {
        # split on a character boundary: a byte-wise substr() would cut a
        # multibyte first letter in half and yield an invalid UTF-8 pattern
        $first_letter = mb_substr ( $file , 0 , 1 , 'UTF-8' ) ;
        $rest = mb_substr ( $file , 1 , null , 'UTF-8' ) ;

        $upper = mb_strtoupper ( $first_letter , 'UTF-8' ) ;
        $lower = mb_strtolower ( $first_letter , 'UTF-8' ) ;
        if ( $upper !== $lower ) {
            # the first letter has distinct upper/lowercase forms: accept both
            $first_letter = "[" . $upper . $lower . "]" ;
        } else {
            # caseless character: it can be a regex metacharacter (or '/'), so escape it
            $first_letter = preg_quote ( $first_letter , '/' ) ;
        }

        ## normalise mediawiki filenames: _ to space, first letter anycase, then space to [_ ]
        $rest = str_replace ( '_' , ' ' , $rest ) ;
        $pattern = $first_letter . preg_quote ( $rest , '/' ) ;
        return str_replace ( ' ' , '[_ ]' , $pattern ) ;
    }

    ##
    ## Remove the filename from various places in the text body:
    ## [[File:...]] style links, gallery lines and template parameters.
    ##   $text    : page text (UTF-8)
    ##   $pattern : regex fragment, such as matcher_prepare_pattern() returns
    ## Returns the text with the matches removed. A step whose regex fails
    ## leaves the text untouched instead of emptying it.
    ##
    function matcher_do_unlink( $text, $pattern ) {
        $new_text = $text;

        # unicode \w
        $w='[\pL\pM]';
        # unicode \s
        $s='\pZ';
        # word end separator (instead of \b, but a zero-width assertion would be nicer)
        $we='(?=[^\pL\pM\n]|$)';

        # filename: " Image : name.ext", e.g. File:x.jpg or a localised namespace
        ### 20220523- request not to remove [[:File:....]] -g
        ### (hence no optional leading ':' in front of the namespace here)
        $pattern_file = "$s*$w+ *: *$pattern" ;
        # filename in galleries (leading : cannot stand w/o namespace)
        $pattern_gfile= "((: *)?$w+ *: *)?$pattern$we" ;
        # links
        # [[ : image : foo.jpg | pip=pop | flip [[flop]] [http://example.com x] [[zig]] zag ]]
        $pattern_link = '\[\[ *' . $pattern_file . "(\[\[.*?\]\]$w*|\[.*?\]|[^\pL\pM\\n\]].*?)*\]\]";
        # if we had to remove the whole line, eat LF, too.
        $pattern_link_wholeline = '^\[\[ *' . $pattern_file . "(\[\[.*?\]\]$w*|\[.*?\]|[^\pL\pM\\n\]].*?)*\]\]$s*\\n";
        # gallery entries; a </gallery> closing tag on the same line is captured as 'gal'
        $pattern_gallery = '\n?^' . $s .'*'. $pattern_gfile .'[^\n]*?((?<gal><\/gallery *>)|$)' ;
        # files within templates
        $pattern_template = '= *' . $pattern_gfile . ' *';

        $this->d->trace(" PatternLink WL : $pattern_link_wholeline");
        $this->d->trace(" PatternLink    : $pattern_link");
        $this->d->trace(" PatternGallery : $pattern_gallery");
        $this->d->trace(" PatternTemplate: $pattern_template");

        # in normal link (non-multiline pattern)
        #  if we have to remove the whole line, do it first
        $new_text = $this->matcher_keep_on_error(
                preg_replace ( "/$pattern_link_wholeline/um" , '' , $new_text ) ,
                $new_text , 'link (wholeline)' ) ;
        $this->d->trace("Text after link replacement (wholeline): \n>>>$new_text<<<");

        #  otherwise leave one space to keep word separation
        $new_text = $this->matcher_keep_on_error(
                preg_replace ( "/ *$pattern_link */u" , ' ' , $new_text ) ,
                $new_text , 'link' ) ;
        $this->d->trace("Text after link replacement: \n>>>$new_text<<<");

        # in gallery: drop the entry, but give back a </gallery> tag if the
        # match has swallowed one
        $new_text = $this->matcher_keep_on_error(
                preg_replace_callback ( "/$pattern_gallery/um",
                    function ($matches) {
                        # 'gal' is only present when the closing tag was part of the match
                        if( array_key_exists( 'gal', $matches ) ) {
                            return $matches['gal'];
                        }
                        return '';
                    },
                    $new_text ) ,
                $new_text , 'gallery' ) ;
        $this->d->trace("Text after gallery replacement: \n>>>$new_text<<<");

        # in template: keep the '=' so the parameter stays, just empty
        $new_text = $this->matcher_keep_on_error(
                preg_replace ( "/$pattern_template/um" , '=' , $new_text ) ,
                $new_text , 'template' ) ;
        $this->d->trace("Text after template replacement: \n>>>$new_text<<<");

        return $new_text;
    }

    ##
    ## Replace file, don't care much about the context.
    ##   $text             : page text (UTF-8)
    ##   $pattern          : regex fragment, such as matcher_prepare_pattern() returns
    ##   $replacement_file : new file name; '_' becomes space, surrounding
    ##                       whitespace is trimmed, first letter is uppercased
    ## Returns the text with every standalone occurrence replaced (an occurrence
    ## glued to a letter/mark on either side is left alone). When the regex
    ## fails the text is returned unchanged.
    ##
    function matcher_do_replacement( $text, $pattern, $replacement_file ) {
        # there is no mb_ucfirst here, so only an ASCII first letter gets uppercased
        $new_file = ucfirst ( trim ( str_replace ( '_' , ' ' , $replacement_file ) ) ) ;
        $pattern = '(?<=^|[^\pL\pM\n])'.$pattern.'(?=$|[^\pL\pM])';
        $this->d->trace(" ReplMatch: $pattern");

        # the name goes in through a callback: as a plain preg_replace()
        # replacement string, a '$1' or '\1' inside the file name would be taken
        # for a backreference and vanish
        $new_text = preg_replace_callback ( "/$pattern/um" ,
                function ($matches) use ($new_file) {
                    return $new_file;
                },
                $text ) ;

        return $this->matcher_keep_on_error( $new_text , $text , 'replacement' ) ;
    }
}

mercurial