comparison dcclib/ckfuz2.c @ 0:c7f6b056b673

First import of vendor version
author Peter Gervai <grin@grin.hu>
date Tue, 10 Mar 2009 13:49:58 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c7f6b056b673
1 /* Distributed Checksum Clearinghouse
2 *
3 * compute fuzzy body checksum #2
4 *
5 * Copyright (c) 2008 by Rhyolite Software, LLC
6 *
7 * This agreement is not applicable to any entity which sells anti-spam
8 * solutions to others or provides an anti-spam solution as part of a
9 * security solution sold to other entities, or to a private network
10 * which employs the DCC or uses data provided by operation of the DCC
11 * but does not provide corresponding data to other users.
12 *
13 * Permission to use, copy, modify, and distribute this software without
14 * changes for any purpose with or without fee is hereby granted, provided
15 * that the above copyright notice and this permission notice appear in all
16 * copies and any distributed versions or copies are either unchanged
17 * or not called anything similar to "DCC" or "Distributed Checksum
18 * Clearinghouse".
19 *
20 * Parties not eligible to receive a license under this agreement can
21 * obtain a commercial license to use DCC by contacting Rhyolite Software
22 * at sales@rhyolite.com.
23 *
24 * A commercial license would be for Distributed Checksum and Reputation
25 * Clearinghouse software. That software includes additional features. This
26 * free license for Distributed ChecksumClearinghouse Software does not in any
27 * way grant permision to use Distributed Checksum and Reputation Clearinghouse
28 * software
29 *
30 * THE SOFTWARE IS PROVIDED "AS IS" AND RHYOLITE SOFTWARE, LLC DISCLAIMS ALL
31 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
32 * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL RHYOLITE SOFTWARE, LLC
33 * BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
34 * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
35 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
36 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
37 * SOFTWARE.
38 *
39 * Rhyolite Software DCC 1.3.103-1.52 $Revision$
40 */
41
42 #include "dcc_ck.h"
43
44 #include "ckfuz2_tbl.h"
45 struct {
46 const char **words;
47 u_int len;
48 const u_char *cset;
49 } tbls[FUZ2_LAN_NUM] = {
50 {word_tbl0, word_tbl0_LEN, 0},
51 {word_tbl1, word_tbl1_LEN, 0},
52 {word_tbl2, word_tbl2_LEN, dcc_cset_2},
53 };
54
55
/* Shorthand for the FUZ2 working state.  It only works inside a function
 * that has a variable or parameter named `cks`. */
56 #define FZ2 cks->fuz2
57
/* Staging buffer for the words of one language that were found in its table.
 * add_word() appends a word and hands the buffer to MD5 as soon as `blen`
 * reaches BUF_LEN, so `buf` has room for BUF_LEN bytes plus one more
 * complete word.
 *	buf	bytes waiting to be summed
 *	blen	number of bytes of buf in use
 * BUF_LEN is #undef'ed again at the end of dcc_ck_fuz2(). */
58 #define BUF_LEN 1024
59 typedef struct {
60 char buf[BUF_LEN+sizeof(DCC_FUZ2_WORD)+1];
61 int blen;
62 } LBUF;
63
64
/* Add `l` bytes at `b` to the MD5 context of the language state `lp`.
 * When built with DCC_DEBUG_CKSUM and dcc_clnt_debug is exactly 5, the same
 * bytes are first written to file descriptor 1.
 * `b` and `l` are expanded more than once in the debugging variant, so they
 * must not have side effects. */
65 #ifdef DCC_DEBUG_CKSUM
66 #define FUZ2(lp, b, l) (dcc_clnt_debug == 5 \
67 ? (write(1, b, l), MD5Update(&lp->md5, b, l)) \
68 : MD5Update(&lp->md5, b, l))
69 #else
70 #define FUZ2(lp, b, l) MD5Update(&lp->md5, b, l)
71 #endif
72
73
74 void
75 dcc_ck_fuz2_init(DCC_GOT_CKS *cks)
76 {
77 FUZ2_LANG *lp;
78
79 FZ2.wlen = 0;
80 DCC_FUZ2_WORD_CLEAR(&FZ2.w);
81 FZ2.st = DCC_FUZ2_ST_WORD;
82 FZ2.url_cp = FZ2.url_buf;
83 FZ2.urls = 0;
84
85 cks->sums[DCC_CK_FUZ2].type = DCC_CK_FUZ2;
86
87 FZ2.btotal = 0;
88 FZ2.xsummed = 0;
89 for (lp = FZ2.lang; lp <= LAST(FZ2.lang); ++lp) {
90 lp->wsummed = 0;
91 lp->wtotal = 0;
92 MD5Init(&lp->md5);
93 }
94 }
95
96
97
98 static inline u_char /* 1=found it, 0=not a known word */
99 lookup_word(const DCC_FUZ2_WORD *w, u_int wlen,
100 const char **word_tbl, u_int word_tbl_len)
101 {
102 const char *p;
103 u_int n;
104
105 p = word_tbl[fuz2_word_hash(w, word_tbl_len)];
106 if (!p)
107 return 0;
108 for (;;) {
109 n = *p++;
110 if (!n)
111 return 0;
112 if (n == wlen && !memcmp(w->b, p, n))
113 return 1;
114 p += n;
115 }
116 }
117
118
119
120 static void
121 add_word(DCC_GOT_CKS *cks, LBUF *lbp)
122 {
123 FUZ2_LANG *lp;
124 int tbl;
125
126 tbl = 0;
127 for (lp = FZ2.lang; lp < &FZ2.lang[FUZ2_LAN_NUM]; ++lp, ++lbp, ++tbl) {
128 if (tbls[tbl].cset != 0
129 && tbls[tbl].cset != cks->mime_cset)
130 continue;
131 if (lookup_word(&FZ2.w, FZ2.wlen,
132 tbls[tbl].words, tbls[tbl].len)) {
133 ++lp->wtotal;
134 memcpy(&lbp->buf[lbp->blen], &FZ2.w, FZ2.wlen);
135 if ((lbp->blen += FZ2.wlen) >= BUF_LEN) {
136 lp->wsummed += lbp->blen;
137 FUZ2(lp, lbp->buf, lbp->blen);
138 lbp->blen = 0;
139 }
140 }
141 }
142 }
143
144
145
146 /* resolve an HTML characater reference */
147 static u_char /* 0 or found value */
148 lookup_cref(DCC_FUZ2_WORD *w, u_int clen)
149 {
150 const char *p;
151 u_int n;
152
153 if (clen > sizeof(DCC_FUZ2_WORD)
154 || clen == 0)
155 return 0;
156 p = cref_tbl[fuz2_word_hash(w, cref_tbl_LEN)];
157 if (!p)
158 return 0;
159 for (;;) {
160 n = *p++;
161 if (!n)
162 return 0;
163 if (n == clen && !memcmp(w->b, p, n))
164 return p[clen];
165 p += n+1;
166 }
167 }
168
169
170
171 static int
172 adv_cref(DCC_GOT_CKS *cks, u_char c)
173 {
174 switch (FZ2.cref_st) {
175 case DCC_CREF_ST_IDLE:
176 dcc_logbad(EX_SOFTWARE, "impossible fuz2 cref state");
177 break;
178
179 case DCC_CREF_ST_START:
180 /* start to get a possible HTML character reference
181 * We have already passed the '&' */
182 if (c == '#') {
183 FZ2.cref_st = DCC_CREF_ST_NUM;
184 } else if ((c >= 'a' && c <= 'z')
185 || (c >= 'A' && c <= 'Z')) {
186 DCC_FUZ2_WORD_CLEAR(&FZ2.cref_w);
187 FZ2.cref_w.b[0] = c;
188 FZ2.cref_cnt = 1;
189 FZ2.cref_st = DCC_CREF_ST_NAME;
190 } else {
191 FZ2.cref_st = DCC_CREF_ST_IDLE;
192 return -2;
193 }
194 break;
195
196 case DCC_CREF_ST_NUM:
197 /* look for 'x' or the first digit */
198 if (c == 'x' || c == 'X') {
199 FZ2.cref_st = DCC_CREF_ST_HEX;
200 FZ2.cref_cnt = 0;
201 } else if (c >= '0' && c <= '9') {
202 FZ2.cref_cnt = c - '0';
203 FZ2.cref_st = DCC_CREF_ST_DEC;
204 } else {
205 FZ2.cref_st = DCC_CREF_ST_IDLE;
206 return -2;
207 }
208 break;
209
210 case DCC_CREF_ST_DEC:
211 if (c >= '0' && c <= '9') {
212 FZ2.cref_cnt = FZ2.cref_cnt*10 + (c - '0');
213 } else if (cks->mime_cset[c] == FC_SP) {
214 FZ2.cref_st = DCC_CREF_ST_IDLE;
215 return c == ';';
216 } else {
217 FZ2.cref_st = DCC_CREF_ST_IDLE;
218 return -2;
219 }
220 if (FZ2.cref_cnt > 256)
221 FZ2.cref_cnt = 256;
222 ++FZ2.btotal;
223 break;
224
225 case DCC_CREF_ST_HEX:
226 if ((c >= 'a' && c <= 'f')
227 || (c >= 'A' && c <= 'F')) {
228 FZ2.cref_cnt = (FZ2.cref_cnt<<4) + (c & 0xf) + 9;
229 } else if (c >= '0' && c <= '9') {
230 FZ2.cref_cnt = (FZ2.cref_cnt<<4) + (c - '0');
231 } else if (cks->mime_cset[c] == FC_SP) {
232 FZ2.cref_st = DCC_CREF_ST_IDLE;
233 return c == ';';
234 } else {
235 FZ2.cref_st = DCC_CREF_ST_IDLE;
236 return -2;
237 }
238 if (FZ2.cref_cnt > 256)
239 FZ2.cref_cnt = 256;
240 ++FZ2.btotal;
241 break;
242
243 case DCC_CREF_ST_NAME:
244 if ((c >= 'a' && c <= 'z')
245 || (c >= 'A' && c <= 'Z')) {
246 /* If the word is too long, the final match will fail.
247 * This will consume it */
248 if (FZ2.cref_cnt < sizeof(DCC_FUZ2_WORD))
249 FZ2.cref_w.b[FZ2.cref_cnt++] = c;
250 } else if (cks->mime_cset[c] == FC_SP) {
251 /* this character ends the cref */
252 FZ2.cref_cnt = lookup_cref(&FZ2.cref_w, FZ2.cref_cnt);
253 FZ2.cref_st = DCC_CREF_ST_IDLE;
254 return c == ';';
255 } else {
256 FZ2.cref_st = DCC_CREF_ST_IDLE;
257 return -2;
258 }
259 break;
260 }
261 return -1;
262 }
263
264
/* Add the next chunk of text to the FUZ2 checksum.
 *
 *	cks	checksum state; nothing is done unless
 *		cks->sums[DCC_CK_FUZ2].type is still DCC_CK_FUZ2
 *	bp	the bytes of the chunk
 *	bp_len	number of bytes at bp
 *
 * All parser state is kept in cks->fuz2 (FZ2), so a word, HTML tag, HTML
 * comment, or character reference that is cut off by the end of the chunk
 * is continued by the next call.
 * Words that are found in a language table are collected in one local
 * buffer per language and given to the MD5 context of that language before
 * returning.  Host names of URLs seen inside HTML tags are saved in
 * FZ2.url_buf and counted in FZ2.urls for dcc_ck_fuz2_fin().
 */
265 void
266 dcc_ck_fuz2(DCC_GOT_CKS *cks, const char *bp, u_int bp_len)
267 {
/* SKIP_WORD() marks the current word as too long.  That value of wlen
 * stops further letters from being saved and fails the length test that
 * is made before add_word(), so the word is dropped at the next white space.
 * JUNK() does that and also returns the parser to the word state. */
268 #define SKIP_WORD() (FZ2.wlen = sizeof(DCC_FUZ2_WORD)+1)
269 #define JUNK() (SKIP_WORD(), FZ2.st = DCC_FUZ2_ST_WORD)
270 LBUF *lbp, lbufs[FUZ2_LAN_NUM];
271 FUZ2_LANG *lp;
272 int i;
273 char *p;
274 u_char c;
275
/* do nothing if FUZ2 was never started or has been invalidated */
276 if (cks->sums[DCC_CK_FUZ2].type != DCC_CK_FUZ2)
277 return;
278
/* the staging buffers are local, so they start empty on every call */
279 for (lbp = lbufs; lbp <= LAST(lbufs); ++lbp)
280 lbp->blen = 0;
281
/* Each pass of this loop runs the code of the current parser state until
 * the state changes or the chunk is used up. */
282 while (bp_len != 0) {
283 switch (FZ2.st) {
284 case DCC_FUZ2_ST_WORD:
285 /* gathering a word */
286 do {
287 if (FZ2.cref_st == DCC_CREF_ST_IDLE) {
/* ordinary text: consume a byte and translate it with the
 * table for the character set of the message */
288 --bp_len;
289 c = *bp++;
290 c = cks->mime_cset[c];
291 if (c == FC_CF) {
/* start a character reference only in HTML;
 * elsewhere the byte acts as white space */
292 if (cks->mime_ct == DCC_CK_CT_HTML){
293 FZ2.cref_st = DCC_CREF_ST_START;
294 break;
295 }
296 c = FC_SP;
297 }
298 } else {
/* in the middle of a character reference: show the next byte
 * to adv_cref() without consuming it */
299 i = adv_cref(cks, *bp);
300 if (i == -2) {
/* adv_cref() is idle again, so the same byte is
 * handled as ordinary text on the next pass */
301 JUNK(); /* bogus cref */
302 continue;
303 }
304 if (i < 0) { /* get more of cref */
305 --bp_len;
306 ++bp;
307 continue;
308 }
/* i is 1 if the reference ended with ';', which is consumed here,
 * or 0 if it ended with some other byte, which is left for the next pass */
309 bp += i; /* use complete cref */
310 bp_len -= i;
/* NOTE(review): adv_cref() limits cref_cnt to 256, not 255, so this
 * index can be 256; confirm that the character set tables have an
 * entry for it. */
311 c = cks->mime_cset[FZ2.cref_cnt];
312
313 if (c == FC_SK)
314 continue; /* ignore accent mark */
315 }
316
/* values of FC_A or more are saved as the letters of the word */
317 if (c >= FC_A) {
318 ++FZ2.btotal;
319 if (FZ2.wlen < sizeof(DCC_FUZ2_WORD))
320 FZ2.w.b[FZ2.wlen++] = c;
321 else
322 SKIP_WORD();
323 continue;
324 }
325
/* white space ends the word; it is looked up only if its length is
 * acceptable, which is never true after SKIP_WORD() or JUNK() */
326 if (c == FC_SP) {
327 if (FZ2.wlen >= MIN_WLEN
328 && FZ2.wlen <=sizeof(DCC_FUZ2_WORD))
329 add_word(cks, lbufs);
330 FZ2.wlen = 0;
331 DCC_FUZ2_WORD_CLEAR(&FZ2.w);
332 continue;
333 }
334 ++FZ2.btotal;
335
/* a possible HTML tag; leave the word loop to collect its name */
336 if (c == FC_LT) {
337 FZ2.tag_len = 0;
338 DCC_FUZ2_WORD_CLEAR(&FZ2.tag);
339 FZ2.st = DCC_FUZ2_ST_START_TAG;
340 break;
341 }
342
/* anything else spoils the word that contains it */
343 JUNK();
344 } while (bp_len != 0);
345 break;
346
347 case DCC_FUZ2_ST_START_TAG:
348 /* collecting an HTML tag or comment
349 * We've passed the '<' */
/* This state looks at one byte per pass of the outer loop and consumes
 * it only if it is saved as part of the tag name or is the closing '>'. */
350 c = *bp;
351 #define SAVE_TAG(_c) (FZ2.tag.b[FZ2.tag_len++] = _c, \
352 ++FZ2.btotal, ++bp, --bp_len)
353 if (((c >= 'a' && c <= 'z') /* tag */
354 || (c >= '0' && c <= '9'))
355 && FZ2.tag_len < sizeof(FZ2.tag)) {
356 SAVE_TAG(c);
357 break;
358 }
/* upper case letters of the tag name are saved as lower case */
359 if (c >= 'A' && c <= 'Z'
360 && FZ2.tag_len < sizeof(FZ2.tag)) {
361 SAVE_TAG(c - ('A'-'a'));
362 break;
363 }
364 if ((c == '/' /* end-tag */
365 || c == '!') /* start of comment */
366 && FZ2.tag_len == 0) {
367 SAVE_TAG(c);
368 break;
369 }
370 if (c == '-' /* comment */
371 && FZ2.tag_len >= 1 && FZ2.tag_len <= 2) {
372 SAVE_TAG(c);
373 break;
374 }
375 #undef SAVE_TAG
376
/* c is the first byte that cannot be part of the tag name
 * and has not been consumed */
377 /* notice an <html> tag while in text/plain
378 * and switch to text/html */
379 if (FZ2.tag_len == 4
380 && cks->mime_ct != DCC_CK_CT_HTML
381 && !memcmp(FZ2.tag.b, "html", 4))
382 cks->mime_ct = DCC_CK_CT_HTML;
383
384 if (cks->mime_ct == DCC_CK_CT_HTML
385 && FZ2.tag_len > 0) {
386 /* if we are in an HTML document and we
387 * have at least one character after '<',
388 * assume it is some kind of HTML tag */
389 FZ2.xsummed += FZ2.tag_len+1; /* count '<' */
390 if (c == '>') {
391 /* optimize common simple tags */
392 ++FZ2.xsummed;
393 ++FZ2.btotal;
394 ++bp, --bp_len;
395 FZ2.st = DCC_FUZ2_ST_WORD;
396 break;
397 }
/* The rest of the tag or comment is skipped by another state.
 * tag_len is not reset, so the fail-safe limit in those states
 * counts the bytes of the name too. */
398 if (FZ2.tag_len >= 3
399 && !memcmp(FZ2.tag.b, "!--", 3)) {
400 FZ2.st = DCC_FUZ2_ST_SKIP_COMMENT;
401 } else {
402 FZ2.url.st = DCC_URL_ST_IDLE;
403 FZ2.st = DCC_FUZ2_ST_SKIP_TAG;
404 }
405 } else {
406 /* assume it is not an HTML tag and
407 * mark the whole word as junk */
408 JUNK();
409 }
410 break;
411
412 case DCC_FUZ2_ST_SKIP_TAG:
413 /* Skip rest of boring HTML tag
414 * We ought to ignore '>' in quotes */
415 do {
416 if (FZ2.cref_st == DCC_CREF_ST_IDLE) {
417 --bp_len;
418 c = *bp++;
/* NOTE(review): the word state compares FC_CF with the byte after
 * translation by mime_cset[], but here it is compared with the raw
 * byte; confirm that FC_CF has the same value as '&'. */
419 if (c == FC_CF) {
420 FZ2.cref_st = DCC_CREF_ST_START;
421 continue;
422 }
423 } else {
424 i = adv_cref(cks, *bp);
/* a rejected byte has not been consumed; adv_cref() is idle
 * again, so the next pass reads it as ordinary tag text */
425 if (i == -2)
426 continue;
427 if (i < 0) { /* get more of cref */
428 --bp_len;
429 ++bp;
430 continue;
431 }
432 bp += i; /* use complete cref */
433 bp_len -= i;
/* the resolved character is used directly, without mime_cset[] */
434 c = FZ2.cref_cnt;
435 }
436
437 /* capture URLs */
/* dcc_ck_url() returns an action in the bits selected by
 * DCC_CK_URL_MASK and a character above DCC_CK_URL_SHIFT */
438 i = dcc_ck_url(&FZ2.url, c, &FZ2.url_cp);
439 c = i>>DCC_CK_URL_SHIFT;
440 switch ((DCC_CK_URL)(i & DCC_CK_URL_MASK)) {
441 case DCC_CK_URL_CHAR:
442 break;
443 case DCC_CK_URL_CK_LEN:
444 /* Make room before starting a URL
445 * if we are too close to
446 * end of buffer for a maximum size URL.
447 * Discard the first URL in the buffer.
448 * This relies on dcc_ck_url() limiting
449 * the URL to DCC_URL_MAX bytes */
450 while (FZ2.url_cp
451 >= &FZ2.url_buf[ISZ(FZ2.url_buf)
452 - DCC_FUZ2_URL_MAX]) {
/* the saved URLs are separated by '\0' */
453 p = memchr(FZ2.url_buf, '\0',
454 FZ2.url_cp-FZ2.url_buf);
455 if (!p) {
456 /* if this was the first URL,
457 * discard half of it */
458 p = &FZ2.url_buf[DCC_URL_MAX/2];
459 } else {
460 ++p;
461 }
/* slide what is kept to the front of the buffer */
462 memmove(FZ2.url_buf, p,
463 FZ2.url_cp - p);
464 FZ2.url_cp -= p - FZ2.url_buf;
465 }
/* end the previous URL, if there is one, before the new one */
466 if (FZ2.url_cp != FZ2.url_buf)
467 *FZ2.url_cp++ = '\0';
468 ++FZ2.urls;
469 break;
470 case DCC_CK_URL_HOST:
471 case DCC_CK_URL_DOT:
/* save a byte of the host name unless the buffer is full */
472 if (FZ2.url_cp
473 < &FZ2.url_buf[ISZ(FZ2.url_buf)])
474 *FZ2.url_cp++ = c;
475 continue;
476 case DCC_CK_URL_HOST_END:
477 case DCC_CK_URL_HOST_RESET:
478 case DCC_CK_URL_SKIP:
479 continue;
480 }
481
/* only DCC_CK_URL_CHAR and DCC_CK_URL_CK_LEN get here;
 * the other actions continued the do-while loop */
482 if (c == '>') {
483 ++FZ2.xsummed;
484 ++FZ2.btotal;
485 FZ2.st = DCC_FUZ2_ST_WORD;
486 break;
487 }
488 if (cks->mime_cset[c] != FC_SP) {
489 ++FZ2.xsummed;
490 ++FZ2.btotal;
491 /* don't let wild tags run forever */
492 if (++FZ2.tag_len > DCC_URL_FAILSAFE) {
493 JUNK();
494 break;
495 }
496 }
497 } while (bp_len != 0);
498 break;
499
500 case DCC_FUZ2_ST_SKIP_COMMENT:
501 /* HTML comments can include HTML tags,
502 * but spammers don't understand HTML comment syntax
503 * and Netscape and IE treat (and ignore) broken
504 * comments like strange tags. */
/* so the comment ends at the first '>' instead of at "-->" */
505 do {
506 --bp_len;
507 c = *bp++;
508 if (c == '>') {
509 ++FZ2.xsummed;
510 ++FZ2.btotal;
511 FZ2.st = DCC_FUZ2_ST_WORD;
512 break;
513 }
514 if (cks->mime_cset[c] != FC_SP) {
515 ++FZ2.xsummed;
516 ++FZ2.btotal;
517 /* don't let wild tags run forever */
518 if (++FZ2.tag_len > DCC_URL_FAILSAFE) {
519 JUNK();
520 break;
521 }
522 }
523 } while (bp_len != 0);
524 break;
525 }
526 }
/* give the words that are still in the local buffers to the MD5 context
 * of their languages, because the buffers do not survive the call */
527 for (lbp = lbufs, lp = FZ2.lang; lbp <= LAST(lbufs); ++lbp, ++lp) {
528 if (lbp->blen != 0) {
529 lp->wsummed += lbp->blen;
530 FUZ2(lp, lbp->buf, lbp->blen);
531 }
532 }
533 #undef SKIP_WORD
534 #undef JUNK
535 #undef BUF_LEN
536 }
537
538
539
540 void
541 dcc_ck_fuz2_fin(DCC_GOT_CKS *cks)
542 {
543 FUZ2_LANG *lp, *lp1;
544
545 if (cks->sums[DCC_CK_FUZ2].type != DCC_CK_FUZ2)
546 return;
547
548 /* pick the language checksum of the most words */
549 lp = FZ2.lang;
550 for (lp1 = lp+1; lp1 <= LAST(FZ2.lang); ++lp1) {
551 if (lp->wtotal < lp1->wtotal)
552 lp = lp1;
553 }
554
555 #ifdef DCC_DEBUG_CKSUM
556 if (dcc_clnt_debug > 3)
557 printf("\n***fuz2: wtotal[%d]=%d summed=%d+%d btotal=%d\n",
558 (int)(lp-FZ2.lang),
559 lp->wtotal, lp->wsummed, FZ2.xsummed, FZ2.btotal);
560 #endif
561 /* The FUZ2 checksum is not valid if it is on a few words and
562 * less than 10% of a big, binary file */
563 if (lp->wtotal < 100
564 && (lp->wsummed+FZ2.xsummed)*10 < FZ2.btotal) {
565 cks->sums[DCC_CK_FUZ2].type = DCC_CK_INVALID;
566 return;
567 }
568 /* We cannot compute a checksum on a nearly empty message */
569 if (lp->wtotal < 8) {
570 if (lp->wtotal + FZ2.urls*4 >= 8) {
571 /* use URLs if we lack words */
572 FUZ2(lp, FZ2.url_buf, FZ2.url_cp - FZ2.url_buf);
573 } else {
574 /* Compute a checksum for local blacklising on messages
575 * that look empty to the FUZ2 checksum but are not and
576 * are not too binary. The resulting checksum is zero.
577 * Do not report it to the DCC server. */
578 if ((lp->wsummed+FZ2.xsummed) >= 120) {
579 memset(cks->sums[DCC_CK_FUZ2].sum, 0,
580 sizeof(cks->sums[DCC_CK_FUZ2].sum));
581 return;
582 }
583 cks->sums[DCC_CK_FUZ2].type = DCC_CK_INVALID;
584 return;
585 }
586 }
587
588 MD5Final(cks->sums[DCC_CK_FUZ2].sum, &lp->md5);
589 cks->sums[DCC_CK_FUZ2].rpt2srvr = 1;
590 }