Thu, 22 Feb 2024 20:21:22 +0100
Skipping {{nobots}} pages
0
3b714bbb1347
Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff
changeset
|
1 | <?php |
3b714bbb1347
Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff
changeset
|
2 | |
3b714bbb1347
Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff
changeset
|
3 | ## This file contains the regex replacement functions. |
3b714bbb1347
Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff
changeset
|
4 | ## It is separated to be testable, by test_regex.php. |
3b714bbb1347
Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff
changeset
|
5 | ## This has been written by Peter 'grin' Gervai about 2022. |
3b714bbb1347
Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff
changeset
|
6 | ## Licensed under GPLv3+ and CC_By_Sa-4.0. |
3b714bbb1347
Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff
changeset
|
7 | ## |
3b714bbb1347
Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff
changeset
|
8 | ## $Id$ |
3b714bbb1347
Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff
changeset
|
9 | ## |
3b714bbb1347
Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff
changeset
|
10 | |
3b714bbb1347
Add files without passwords and other unwanted fluff.
Peter Gervai <grin@grin.hu>
parents:
diff
changeset
|
class Matcher {
    # Debug framework object; this class calls its msg() and trace() methods.
    private $d;

    ##
    ## Keep the debug framework object and announce that the matcher is ready.
    ##
    ## $debug: object providing msg($string) and trace($string).
    ##
    public function __construct( $debug ) {
        $this->d = $debug;
        $this->d->msg("Matcher debug initialized");
    }

    ##
    ## Prepare a regex fragment (for use between '/' delimiters) from a filename.
    ##
    ## MediaWiki-style normalisation: the first letter matches in either case,
    ## and every space or underscore matches both ' ' and '_'. Everything else
    ## is quoted so that it matches literally.
    ##
    ## $file: the file name, UTF-8, without namespace prefix.
    ## Returns the regex fragment as a string.
    ##
    public function matcher_prepare_pattern( $file ) {
        # Split on characters, not bytes: a byte-wise split would cut a
        # multibyte first letter in half and lose the any-case match.
        $first_letter = mb_substr( $file, 0, 1, 'UTF-8' );
        $rest = mb_substr( $file, 1, null, 'UTF-8' );

        $upper = mb_strtoupper( $first_letter, 'UTF-8' );
        $lower = mb_strtolower( $first_letter, 'UTF-8' );

        if ( $upper !== $lower
            && mb_strlen( $upper, 'UTF-8' ) === 1
            && mb_strlen( $lower, 'UTF-8' ) === 1 ) {
            # The letter has distinct upper/lowercase forms: include both as [Aa].
            $first_letter = '[' . $upper . $lower . ']';
        } else {
            # No case pair, or the case mapping expands to several characters
            # (which cannot be expressed as a character class): match it
            # literally. It can be a metacharacter, so escape it (and '/').
            $first_letter = preg_quote( $first_letter, '/' );
        }

        # Normalise '_' to space, quote, then let each space match [_ ].
        $rest = str_replace( '_', ' ', $rest );
        $pattern = $first_letter . preg_quote( $rest, '/' );

        return str_replace( ' ', '[_ ]', $pattern );
    }

    ##
    ## Remove the file from various places in the text body: [[File:...]]
    ## links, gallery lines and template parameter values.
    ##
    ## $text:    the page text (UTF-8).
    ## $pattern: regex fragment as produced by matcher_prepare_pattern().
    ## Returns the modified text. If a regex step fails, that step is skipped
    ## and reported through the debug object instead of wiping the text.
    ##
    public function matcher_do_unlink( $text, $pattern ) {
        # Unicode "word" character: letters and combining marks only.
        $w = '[\pL\pM]';
        # Unicode separator (space-like) character.
        $s = '\pZ';
        # Word end: lookahead for a non-letter (newline excluded) or end of line/text.
        $we = '(?=[^\pL\pM\n]|$)';

        # "Namespace : name.ext", e.g. File:x.jpg, Tập_tin:x.jpg
        # 20220523: by request a leading ':' ([[:File:...]]) is NOT matched,
        # so plain links to the file page are left alone. -g
        $pattern_file = "$s*$w+ *: *$pattern" ;
        # Filename in galleries/templates; the namespace is optional here
        # (a leading ':' cannot stand without a namespace).
        $pattern_gfile = "((: *)?$w+ *: *)?$pattern$we" ;

        # Rest of a file link after the name, up to the closing ']]':
        # [[ image : foo.jpg | pip=pop | flip [[flop]] [http://example.com x] [[zig]] zag ]]
        $link_tail = "(\[\[.*?\]\]$w*|\[.*?\]|[^\pL\pM\\n\]].*?)*\]\]";
        $pattern_link = '\[\[ *' . $pattern_file . $link_tail;
        # If the link is alone at the start of a line, eat the line feed, too.
        $pattern_link_wholeline = '^' . $pattern_link . "$s*\\n";
        # Gallery entries: a whole line starting with the file name; a closing
        # </gallery> tag on the same line is captured as 'gal' so it can be kept.
        $pattern_gallery = '\n?^' . $s . '*' . $pattern_gfile . '[^\n]*?((?<gal><\/gallery *>)|$)' ;
        # Files used as template parameter values: "= name.ext".
        $pattern_template = '= *' . $pattern_gfile . ' *';

        $this->d->trace(" PatternLink WL : $pattern_link_wholeline");
        $this->d->trace(" PatternLink : $pattern_link");
        $this->d->trace(" PatternGallery : $pattern_gallery");
        $this->d->trace(" PatternTemplate: $pattern_template");

        # Links that occupy a whole line go first, together with their newline.
        $new_text = $this->matcher_replace_or_keep(
            "/$pattern_link_wholeline/um", '', $text, 'whole-line link removal' );
        $this->d->trace("Text after link replacement (wholeline): \n>>>$new_text<<<");

        # Inline links: leave one space to keep word separation.
        $new_text = $this->matcher_replace_or_keep(
            "/ *$pattern_link */u", ' ', $new_text, 'link removal' );
        $this->d->trace("Text after link replacement: \n>>>$new_text<<<");

        # Gallery lines: drop the line but keep a trailing </gallery> tag.
        $new_text = $this->matcher_replace_or_keep(
            "/$pattern_gallery/um",
            static function ( $matches ) {
                # 'gal' is only present when the closing tag was on this line.
                return isset( $matches['gal'] ) ? $matches['gal'] : '';
            },
            $new_text,
            'gallery removal' );
        $this->d->trace("Text after gallery replacement: \n>>>$new_text<<<");

        # Template parameters: keep the '=' and empty the value.
        $new_text = $this->matcher_replace_or_keep(
            "/$pattern_template/um", '=', $new_text, 'template removal' );
        $this->d->trace("Text after template replacement: \n>>>$new_text<<<");

        return $new_text;
    }

    ##
    ## Replace the file name with another one, without caring much about the
    ## context: every occurrence not glued to a letter on either side is replaced.
    ##
    ## $text:             the page text (UTF-8).
    ## $pattern:          regex fragment as produced by matcher_prepare_pattern().
    ## $replacement_file: new file name; '_' becomes space, surrounding
    ##                    whitespace is trimmed and the first letter is uppercased.
    ## Returns the modified text, or the unchanged text if the regex fails.
    ##
    public function matcher_do_replacement( $text, $pattern, $replacement_file ) {
        $new_file = trim( str_replace( '_', ' ', $replacement_file ) );

        # Multibyte-aware ucfirst. Skip letters whose uppercase form expands
        # to several characters, so the name is never lengthened.
        $head_upper = mb_strtoupper( mb_substr( $new_file, 0, 1, 'UTF-8' ), 'UTF-8' );
        if ( mb_strlen( $head_upper, 'UTF-8' ) === 1 ) {
            $new_file = $head_upper . mb_substr( $new_file, 1, null, 'UTF-8' );
        }

        # The name must not be directly preceded or followed by a letter.
        $pattern = '(?<=^|[^\pL\pM\n])'.$pattern.'(?=$|[^\pL\pM])';
        $this->d->trace(" ReplMatch: $pattern");

        # Insert the name through a callback: as a plain replacement string,
        # '$1' or '\1' inside a file name would be read as a back-reference.
        return $this->matcher_replace_or_keep(
            "/$pattern/um",
            static function ( $matches ) use ( $new_file ) {
                return $new_file;
            },
            $text,
            'file replacement' );
    }

    ##
    ## Run preg_replace (string replacement) or preg_replace_callback (closure
    ## replacement). The preg functions return null on error; in that case
    ## report it through the debug object and hand back $text untouched.
    ##
    private function matcher_replace_or_keep( $regex, $replacement, $text, $stage ) {
        if ( $replacement instanceof \Closure ) {
            $result = preg_replace_callback( $regex, $replacement, $text );
        } else {
            $result = preg_replace( $regex, $replacement, $text );
        }

        if ( $result === null ) {
            $this->d->msg( "Regex failure during $stage (preg error code "
                . preg_last_error() . "), text left unchanged" );
            return $text;
        }

        return $result;
    }
}