Sat, 17 Feb 2024 15:36:09 +0100
demon.php: ease some db pressure by not querying old entries (7 days).
- replaced some reject workarounds with exclusions
- some more debug messages
<?php
## This file contains the regex replacement functions.
## It is separated to be testable, by test_regex.php.
## This has been written by Peter 'grin' Gervai about 2022.
## Licensed under GPLv3+ and CC_By_Sa-4.0.
##
## $Id$
##

/**
 * Builds filename regexes and uses them to unlink or replace a file name
 * inside a wikitext body.
 *
 * All patterns are applied with the PCRE "u" modifier, so texts and file
 * names are handled as UTF-8.
 */
class Matcher {
    # debug framework object; this class calls its msg() and trace() methods
    private $d;

    /**
     * @param object $debug debug framework object providing msg() and trace()
     */
    function __construct($debug) {
        # ask for the debug framework object
        $this->d = $debug;
        $this->d->msg("Matcher debug initialized");
    }

    /**
     * Prepare a filename regex pattern (without delimiters) from a filename.
     *
     * The first letter matches in either case, every '_' or ' ' in the rest
     * matches both '_' and ' ', and everything else is quoted for use between
     * '/' delimiters.
     *
     * @param string $file file name, without namespace
     * @return string regex fragment matching the file name
     */
    public function matcher_prepare_pattern( $file ) {
        # Split by character, not by byte: a byte-wise split would cut a
        # multibyte first letter in half and lose the any-case alternation.
        $first_letter = mb_substr ( $file , 0 , 1 , 'UTF-8' ) ;
        $pattern = mb_substr ( $file , 1 , null , 'UTF-8' ) ;

        $upper = mb_strtoupper ( $first_letter , 'UTF-8' ) ;
        $lower = mb_strtolower ( $first_letter , 'UTF-8' ) ;

        # If first letter have upper/lowercase include both as [Aa].
        # Skip letters whose case mapping is longer than one character:
        # inside a character class they would match unrelated letters.
        if ( $upper !== $lower
            && mb_strlen ( $upper , 'UTF-8' ) === 1
            && mb_strlen ( $lower , 'UTF-8' ) === 1
        ) {
            $first_letter = "[" . $upper . $lower . "]" ;
        } else {
            # escape special characters and also '/' (can be metacharacter)
            $first_letter = preg_quote ( $first_letter , '/' ) ;
        }

        ## normalise mediawiki filenames: _ to space, first letter anycase, then space to [_ ]
        $pattern = str_replace ( '_' , ' ' , $pattern ) ;
        $pattern = $first_letter . preg_quote ( $pattern , '/' ) ;
        $pattern = str_replace ( ' ' , '[_ ]' , $pattern ) ;

        return $pattern;
    }

    /**
     * Remove the filename from various places in the text body: file links
     * ([[File:x.jpg|...]]), gallery lines and template parameter values.
     *
     * If a regex step fails (PCRE error), that step is skipped and the text
     * from before the step is kept, so the result is never silently emptied.
     *
     * @param string $text    wikitext body
     * @param string $pattern regex fragment as built by matcher_prepare_pattern()
     * @return string text with the file references removed
     */
    public function matcher_do_unlink( $text, $pattern ) {
        $new_text = $text;

        # unicode \w
        $w = '[\pL\pM]';
        # unicode \s
        $s = '\pZ';
        # word end separator (instead of \b, but a zero-width assertion would be nicer)
        $we = '(?=[^\pL\pM\n]|$)';

        # filename: " Image : name.ext", e.g. File:x.jpg, Tập_tin:x.jpg
        # 20220523: request not to remove [[:File:....]], so a leading ':'
        # is deliberately not accepted here. -g
        $pattern_file = "$s*$w+ *: *$pattern" ;

        # filename in galleries (leading : cannot stand w/o namespace)
        $pattern_gfile = "((: *)?$w+ *: *)?$pattern$we" ;

        # links
        # [[ : image : foo.jpg | pip=pop | flip [[flop]] [http://example.com x] [[zig]] zag ]]
        # The tail covers the link parameters (nested [[..]], [..], plain text) up to the closing ]].
        $link_tail = "(\[\[.*?\]\]$w*|\[.*?\]|[^\pL\pM\\n\]].*?)*\]\]" ;
        $pattern_link = '\[\[ *' . $pattern_file . $link_tail ;

        # if we had to remove the whole line, eat LF, too.
        $pattern_link_wholeline = '^\[\[ *' . $pattern_file . $link_tail . "$s*\\n" ;

        # gallery entries; the named group "gal" captures a closing tag on the same line
        $pattern_gallery = '\n?^' . $s . '*' . $pattern_gfile . '[^\n]*?((?<gal><\/gallery *>)|$)' ;

        # files within templates
        $pattern_template = '= *' . $pattern_gfile . ' *';

        $this->d->trace(" PatternLink WL : $pattern_link_wholeline");
        $this->d->trace(" PatternLink : $pattern_link");
        $this->d->trace(" PatternGallery : $pattern_gallery");
        $this->d->trace(" PatternTemplate: $pattern_template");

        # in normal link
        # if we have to remove the whole line, do it first
        $new_text = $this->matcher_keep_on_failure (
            preg_replace ( "/$pattern_link_wholeline/um" , '' , $new_text ) ,
            $new_text ,
            'whole-line link removal'
        ) ;
        $this->d->trace("Text after link replacement (wholeline): \n>>>$new_text<<<");

        # otherwise leave one space to keep word separation (non-multiline pattern)
        $new_text = $this->matcher_keep_on_failure (
            preg_replace ( "/ *$pattern_link */u" , ' ' , $new_text ) ,
            $new_text ,
            'link removal'
        ) ;
        $this->d->trace("Text after link replacement: \n>>>$new_text<<<");

        # in gallery: drop the entry line, but keep a </gallery> that closes on the same line
        $new_text = $this->matcher_keep_on_failure (
            preg_replace_callback (
                "/$pattern_gallery/um" ,
                function ( $matches ) {
                    # "gal" is only present when the closing tag was matched
                    return isset ( $matches['gal'] ) ? $matches['gal'] : '' ;
                } ,
                $new_text
            ) ,
            $new_text ,
            'gallery removal'
        ) ;
        $this->d->trace("Text after gallery replacement: \n>>>$new_text<<<");

        # in template: "param = File:x.jpg" becomes "param ="
        $new_text = $this->matcher_keep_on_failure (
            preg_replace ( "/$pattern_template/um" , '=' , $new_text ) ,
            $new_text ,
            'template removal'
        ) ;
        $this->d->trace("Text after template replacement: \n>>>$new_text<<<");

        return $new_text;
    }

    /**
     * Replace the file name wherever it stands as a separate word; the
     * surrounding context (link, gallery, template) is not examined.
     *
     * The replacement name is normalised ('_' to space, trimmed, first letter
     * uppercased) and inserted literally, so '$' or '\' in a file name are
     * not interpreted as regex back-references. If the regex fails (PCRE
     * error), the text is returned unchanged.
     *
     * @param string $text             wikitext body
     * @param string $pattern          regex fragment as built by matcher_prepare_pattern()
     * @param string $replacement_file new file name
     * @return string text with the file name replaced
     */
    public function matcher_do_replacement( $text, $pattern, $replacement_file ) {
        $new_file = $this->matcher_ucfirst ( trim ( str_replace ( '_' , ' ' , $replacement_file ) ) ) ;

        # the name must not be glued to a letter on either side
        $pattern = '(?<=^|[^\pL\pM\n])'.$pattern.'(?=$|[^\pL\pM])';
        $this->d->trace(" ReplMatch: $pattern");

        # A callback is used instead of a replacement string so that the
        # new name is taken verbatim ("$1", "\1" are not back-references here).
        $new_text = preg_replace_callback (
            "/$pattern/um" ,
            function ( $matches ) use ( $new_file ) {
                return $new_file ;
            } ,
            $text
        ) ;

        return $this->matcher_keep_on_failure ( $new_text , $text , 'file replacement' ) ;
    }

    /**
     * Multibyte-aware ucfirst(): uppercase the first character of a UTF-8 string.
     *
     * The string is returned untouched when the uppercase form of its first
     * character is not exactly one character long (including the empty string).
     *
     * @param string $string
     * @return string
     */
    private function matcher_ucfirst( $string ) {
        $first = mb_substr ( $string , 0 , 1 , 'UTF-8' ) ;
        $upper = mb_strtoupper ( $first , 'UTF-8' ) ;
        if ( mb_strlen ( $upper , 'UTF-8' ) !== 1 ) {
            return $string ;
        }
        return $upper . mb_substr ( $string , 1 , null , 'UTF-8' ) ;
    }

    /**
     * Guard a preg_* result: the preg functions return null on a PCRE error,
     * in which case the failure is logged and the previous text is kept.
     *
     * @param string|null $result   return value of preg_replace()/preg_replace_callback()
     * @param string      $previous text that was passed to the preg function
     * @param string      $step     short step name for the log message
     * @return string $result, or $previous if the step failed
     */
    private function matcher_keep_on_failure( $result, $previous, $step ) {
        if ( $result === null ) {
            $this->d->msg ( "Matcher: PCRE error " . preg_last_error() . " during $step; text left unchanged" ) ;
            return $previous ;
        }
        return $result ;
    }
}