delinquent_files/matcher.inc

Thu, 22 Feb 2024 20:21:22 +0100

author
Peter Gervai <grin@grin.hu>
date
Thu, 22 Feb 2024 20:21:22 +0100
changeset 8
38415be9f910
parent 0
3b714bbb1347
permissions
-rw-r--r--

Skipping {{nobots}} pages

0
3b714bbb1347 Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff changeset
1 <?php
3b714bbb1347 Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff changeset
2
3b714bbb1347 Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff changeset
3 ## This file contains the regex replacement functions.
3b714bbb1347 Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff changeset
4 ## It is separated to be testable, by test_regex.php.
3b714bbb1347 Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff changeset
5 ## This has been written by Peter 'grin' Gervai about 2022.
3b714bbb1347 Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff changeset
6 ## Licensed under GPLv3+ and CC_By_Sa-4.0.
3b714bbb1347 Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff changeset
7 ##
3b714bbb1347 Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff changeset
8 ## $Id$
3b714bbb1347 Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff changeset
9 ##
3b714bbb1347 Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff changeset
10
3b714bbb1347 Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff changeset
class Matcher {
    ## Debug framework object; this class calls its msg() and trace() methods.
    private $d;

    ##
    ## Store the debug framework object and announce that the matcher is ready.
    ##
    ## $debug: object providing msg($text) and trace($text).
    ##
    public function __construct( $debug ) {
        $this->d = $debug;
        $this->d->msg("Matcher debug initialized");
    }

    ##
    ## Prepare a filename regex pattern from a filename.
    ##
    ## The result is a regex fragment (no delimiters, escaped for the '/'
    ## delimiter) in which the first letter matches in either case and every
    ## space or underscore of the name matches both ' ' and '_'.
    ##
    ## $file: the file name, without namespace prefix.
    ## Returns the regex fragment as a string.
    ##
    public function matcher_prepare_pattern( $file ) {
        # Split on a character boundary: a byte-wise substr() would cut a
        # multibyte first letter in half and lose the case-insensitive match.
        $first_letter = mb_substr( $file, 0, 1, 'UTF-8' );
        $rest = mb_substr( $file, 1, null, 'UTF-8' );

        $upper = mb_strtoupper( $first_letter, 'UTF-8' );
        $lower = mb_strtolower( $first_letter, 'UTF-8' );

        # If the first letter has upper/lowercase forms include both as [Aa].
        # Letters whose case mapping is longer than one character (e.g. a
        # sharp s becoming "SS") are kept literal, otherwise the character
        # class would also match unrelated names.
        if ( $upper !== $lower
            && mb_strlen( $upper, 'UTF-8' ) === 1
            && mb_strlen( $lower, 'UTF-8' ) === 1 ) {
            $first_letter = '[' . $upper . $lower . ']';
        } else {
            # escape special characters and also '/': it can be a metacharacter
            $first_letter = preg_quote( $first_letter, '/' );
        }

        ## normalise mediawiki filenames: _ to space, first letter anycase, then space to [_ ]
        $rest = str_replace( '_', ' ', $rest );
        $pattern = $first_letter . preg_quote( $rest, '/' );
        return str_replace( ' ', '[_ ]', $pattern );
    }

    ##
    ## Remove the filename from various places in the text body:
    ## [[Namespace:name|...]] links, gallery lines and template parameters.
    ##
    ## $text:    the text to clean up.
    ## $pattern: regex fragment for the file name, as produced by
    ##           matcher_prepare_pattern().
    ## Returns the text with the references removed. If a regex step fails
    ## (PCRE error), that step is skipped and the text it received is kept,
    ## so a failure never turns the text into null or an empty string.
    ##
    public function matcher_do_unlink( $text, $pattern ) {
        $new_text = $text;

        # unicode \w
        $w = '[\pL\pM]';
        # unicode \s
        $s = '\pZ';
        # word end separator (instead of \b)
        $we = '(?=[^\pL\pM\n]|$)';

        # filename: " Image : name.ext"  e.g. File:x.jpg
        ### 20220523- request not to remove [[:File:....]], so a leading ':'
        ### before the namespace is deliberately NOT accepted here. -g
        $pattern_file = "$s*$w+ *: *$pattern";
        # filename in galleries (leading : cannot stand w/o namespace)
        $pattern_gfile = "((: *)?$w+ *: *)?$pattern$we";
        # links
        # [[ : image : foo.jpg | pip=pop | flip [[flop]] [http://example.com x] [[zig]] zag ]]
        # The tail accepts nested [[..]] links, [..] external links and other
        # option text up to the closing ]].
        $link_tail = "(\[\[.*?\]\]$w*|\[.*?\]|[^\pL\pM\\n\]].*?)*\]\]";
        $pattern_link = '\[\[ *' . $pattern_file . $link_tail;
        # if we had to remove the whole line, eat LF, too.
        $pattern_link_wholeline = '^' . $pattern_link . "$s*\\n";
        # gallery entries; the named group 'gal' catches a </gallery> closing
        # tag sharing the line, which has to survive the removal
        $pattern_gallery = '\n?^' . $s . '*' . $pattern_gfile . '[^\n]*?((?<gal><\/gallery *>)|$)';
        # files within templates
        $pattern_template = '= *' . $pattern_gfile . ' *';

        $this->d->trace(" PatternLink WL : $pattern_link_wholeline");
        $this->d->trace(" PatternLink : $pattern_link");
        $this->d->trace(" PatternGallery : $pattern_gallery");
        $this->d->trace(" PatternTemplate: $pattern_template");

        # in normal link (non-multiline pattern)
        # if we have to remove the whole line, do it first
        $new_text = $this->matcher_keep_on_failure(
            preg_replace( "/$pattern_link_wholeline/um", '', $new_text ),
            $new_text,
            'whole-line link removal'
        );
        $this->d->trace("Text after link replacement (wholeline): \n>>>$new_text<<<");

        # otherwise leave one space to keep word separation
        $new_text = $this->matcher_keep_on_failure(
            preg_replace( "/ *$pattern_link */u", ' ', $new_text ),
            $new_text,
            'link removal'
        );
        $this->d->trace("Text after link replacement: \n>>>$new_text<<<");

        # in gallery: drop the entry but give back a captured </gallery> tag
        $new_text = $this->matcher_keep_on_failure(
            preg_replace_callback(
                "/$pattern_gallery/um",
                static function ( $matches ) {
                    # 'gal' is only present when the closing tag was matched
                    return isset( $matches['gal'] ) ? $matches['gal'] : '';
                },
                $new_text
            ),
            $new_text,
            'gallery removal'
        );
        $this->d->trace("Text after gallery replacement: \n>>>$new_text<<<");

        # in template: keep the '=' so the parameter stays, just empty
        $new_text = $this->matcher_keep_on_failure(
            preg_replace( "/$pattern_template/um", '=', $new_text ),
            $new_text,
            'template removal'
        );
        $this->d->trace("Text after template replacement: \n>>>$new_text<<<");

        return $new_text;
    }

    ##
    ## Replace a file name by another one, without caring much about the context.
    ##
    ## $text:             the text to work on.
    ## $pattern:          regex fragment for the old file name, as produced by
    ##                    matcher_prepare_pattern().
    ## $replacement_file: the new file name; underscores become spaces, the
    ##                    name is trimmed and its first letter is upper-cased.
    ## Returns the text with every stand-alone occurrence replaced. If the
    ## regex fails (PCRE error) the text is returned unchanged.
    ##
    public function matcher_do_replacement( $text, $pattern, $replacement_file ) {
        $new_file = trim( str_replace( '_', ' ', $replacement_file ) );

        # Multibyte-aware ucfirst. The upper-cased letter is only used when it
        # is still a single character, so the name never grows (sharp s etc).
        $head = mb_substr( $new_file, 0, 1, 'UTF-8' );
        $head_upper = mb_strtoupper( $head, 'UTF-8' );
        if ( mb_strlen( $head_upper, 'UTF-8' ) === 1 ) {
            $new_file = $head_upper . mb_substr( $new_file, 1, null, 'UTF-8' );
        }

        # the name must not be glued to a letter on either side
        $pattern = '(?<=^|[^\pL\pM\n])' . $pattern . '(?=$|[^\pL\pM])';
        $this->d->trace(" ReplMatch: $pattern");

        # A callback inserts the name literally. As a plain preg_replace()
        # replacement string, "$1" or "\1" inside a file name would be taken
        # as a backreference and vanish.
        $result = preg_replace_callback(
            "/$pattern/um",
            static function ( $matches ) use ( $new_file ) {
                return $new_file;
            },
            $text
        );

        return $this->matcher_keep_on_failure( $result, $text, 'file replacement' );
    }

    ##
    ## Guard the result of a preg_* call: these return null on a PCRE error
    ## (e.g. invalid UTF-8 input or an exceeded backtrack limit).
    ##
    ## $result:   value returned by the preg_* call.
    ## $previous: text that was passed to the call.
    ## $step:     short description of the step, used in the debug message.
    ## Returns $result, or $previous (after reporting via the debug object)
    ## when $result is null.
    ##
    private function matcher_keep_on_failure( $result, $previous, $step ) {
        if ( $result === null ) {
            $this->d->msg( "Matcher: regex failed during $step (PCRE error code "
                . preg_last_error() . "), text left unchanged" );
            return $previous;
        }
        return $result;
    }
}

mercurial