Mercurial > notdcc
view dcclib/ckfuz2.c @ 0:c7f6b056b673
First import of vendor version
author | Peter Gervai <grin@grin.hu> |
---|---|
date | Tue, 10 Mar 2009 13:49:58 +0100 |
parents | |
children |
line wrap: on
line source
/* Distributed Checksum Clearinghouse
 *
 * compute fuzzy body checksum #2
 *
 * Copyright (c) 2008 by Rhyolite Software, LLC
 *
 * This agreement is not applicable to any entity which sells anti-spam
 * solutions to others or provides an anti-spam solution as part of a
 * security solution sold to other entities, or to a private network
 * which employs the DCC or uses data provided by operation of the DCC
 * but does not provide corresponding data to other users.
 *
 * Permission to use, copy, modify, and distribute this software without
 * changes for any purpose with or without fee is hereby granted, provided
 * that the above copyright notice and this permission notice appear in all
 * copies and any distributed versions or copies are either unchanged
 * or not called anything similar to "DCC" or "Distributed Checksum
 * Clearinghouse".
 *
 * Parties not eligible to receive a license under this agreement can
 * obtain a commercial license to use DCC by contacting Rhyolite Software
 * at sales@rhyolite.com.
 *
 * A commercial license would be for Distributed Checksum and Reputation
 * Clearinghouse software. That software includes additional features. This
 * free license for Distributed ChecksumClearinghouse Software does not in any
 * way grant permision to use Distributed Checksum and Reputation Clearinghouse
 * software
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND RHYOLITE SOFTWARE, LLC DISCLAIMS ALL
 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL RHYOLITE SOFTWARE, LLC
 * BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
 * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Rhyolite Software DCC 1.3.103-1.52 $Revision$
 */

#include "dcc_ck.h"
#include "ckfuz2_tbl.h"

/* One entry per language checksum, parallel to cks->fuz2.lang[] and to the
 * LBUF array used by dcc_ck_fuz2().
 *	words	hash table of known words
 *	len	number of buckets in that hash table
 *	cset	0 if the table applies to any message, otherwise the table
 *		is consulted only when cks->mime_cset is this character set */
struct {
    const char **words;
    u_int len;
    const u_char *cset;
} tbls[FUZ2_LAN_NUM] = {
    {word_tbl0, word_tbl0_LEN, 0},
    {word_tbl1, word_tbl1_LEN, 0},
    {word_tbl2, word_tbl2_LEN, dcc_cset_2},
};

/* shorthand for the fuzzy checksum #2 state of the current message */
#define FZ2 cks->fuz2

/* Recognized words are batched per language before being fed to MD5.
 * The buffer is flushed as soon as it holds BUF_LEN or more bytes, so it
 * needs slack for one more maximum-sized word beyond BUF_LEN. */
#define BUF_LEN 1024
typedef struct {
    char buf[BUF_LEN+sizeof(DCC_FUZ2_WORD)+1];
    int blen;				/* bytes currently in buf */
} LBUF;

/* Add bytes to the MD5 context of one language.
 * With DCC_DEBUG_CKSUM and dcc_clnt_debug == 5 the bytes are also copied
 * to file descriptor 1. */
#ifdef DCC_DEBUG_CKSUM
#define FUZ2(lp, b, l) (dcc_clnt_debug == 5 \
			? (write(1, b, l), MD5Update(&lp->md5, b, l)) \
			: MD5Update(&lp->md5, b, l))
#else
#define FUZ2(lp, b, l) MD5Update(&lp->md5, b, l)
#endif


/* Prepare to compute the fuzzy #2 checksum of a new message.
 *	cks	per-message checksum state; its fuz2 member is reset and
 *		sums[DCC_CK_FUZ2].type is set to DCC_CK_FUZ2 to mark the
 *		checksum as being computed. */
void
dcc_ck_fuz2_init(DCC_GOT_CKS *cks)
{
    FUZ2_LANG *lp;

    FZ2.wlen = 0;
    DCC_FUZ2_WORD_CLEAR(&FZ2.w);
    FZ2.st = DCC_FUZ2_ST_WORD;
    /* dcc_ck_fuz2() tests cref_st before anything else writes it, so it
     * must be reset here.  Otherwise a character reference left unfinished
     * at the end of a previous message (or garbage in a structure that was
     * not zero-filled) would send the first byte of this message into
     * adv_cref(). */
    FZ2.cref_st = DCC_CREF_ST_IDLE;
    FZ2.url_cp = FZ2.url_buf;
    FZ2.urls = 0;

    cks->sums[DCC_CK_FUZ2].type = DCC_CK_FUZ2;

    FZ2.btotal = 0;
    FZ2.xsummed = 0;
    for (lp = FZ2.lang; lp <= LAST(FZ2.lang); ++lp) {
	lp->wsummed = 0;
	lp->wtotal = 0;
	MD5Init(&lp->md5);
    }
}


/* See if a word is in one of the word tables.
 * Each hash bucket is either null or a run of entries, each a length byte
 * followed by that many characters; a zero length byte ends the run.
 *	w		the candidate word
 *	wlen		its length
 *	word_tbl	the hash table
 *	word_tbl_len	number of buckets, passed to fuz2_word_hash() */
static inline u_char			/* 1=found it, 0=not a known word */
lookup_word(const DCC_FUZ2_WORD *w, u_int wlen,
	    const char **word_tbl, u_int word_tbl_len)
{
    const char *p;
    u_int n;

    p = word_tbl[fuz2_word_hash(w, word_tbl_len)];
    if (!p)
	return 0;
    for (;;) {
	n = *p++;
	if (!n)
	    return 0;			/* end of the bucket */
	if (n == wlen && !memcmp(w->b, p, n))
	    return 1;
	p += n;
    }
}


/* Offer the word in FZ2.w to every applicable language.
 * A language that knows the word counts it and appends it to its buffer;
 * a buffer that reaches BUF_LEN bytes is fed to that language's MD5
 * context and emptied.
 *	cks	per-message checksum state
 *	lbp	first of FUZ2_LAN_NUM buffers, parallel to FZ2.lang[] */
static void
add_word(DCC_GOT_CKS *cks, LBUF *lbp)
{
    FUZ2_LANG *lp;
    int tbl;

    tbl = 0;
    for (lp = FZ2.lang; lp < &FZ2.lang[FUZ2_LAN_NUM]; ++lp, ++lbp, ++tbl) {
	/* skip tables tied to a character set other than the message's */
	if (tbls[tbl].cset != 0
	    && tbls[tbl].cset != cks->mime_cset)
	    continue;
	if (lookup_word(&FZ2.w, FZ2.wlen,
			tbls[tbl].words, tbls[tbl].len)) {
	    ++lp->wtotal;
	    memcpy(&lbp->buf[lbp->blen], &FZ2.w, FZ2.wlen);
	    if ((lbp->blen += FZ2.wlen) >= BUF_LEN) {
		lp->wsummed += lbp->blen;
		FUZ2(lp, lbp->buf, lbp->blen);
		lbp->blen = 0;
	    }
	}
    }
}


/* Resolve a named HTML character reference.
 * The buckets of cref_tbl are laid out like those of the word tables except
 * that each name is followed by one byte holding the character it stands
 * for, which is why the scan steps over n+1 bytes per entry.
 *	w	the name without the leading '&'
 *	clen	length of the name */
static u_char				/* 0 or found value */
lookup_cref(DCC_FUZ2_WORD *w, u_int clen)
{
    const char *p;
    u_int n;

    if (clen > sizeof(DCC_FUZ2_WORD) || clen == 0)
	return 0;
    p = cref_tbl[fuz2_word_hash(w, cref_tbl_LEN)];
    if (!p)
	return 0;
    for (;;) {
	n = *p++;
	if (!n)
	    return 0;			/* end of the bucket */
	if (n == clen && !memcmp(w->b, p, n))
	    return p[clen];
	p += n+1;
    }
}


/* Advance the HTML character reference recognizer by one input byte.
 *	cks	per-message checksum state
 *	c	the next input byte, which the caller has not yet consumed
 * Returns
 *	-2	the reference is bogus; the state is back to idle and the
 *		callers leave the byte to be rescanned as ordinary input
 *	-1	the byte belongs to the reference; the callers consume it
 *		and call again with the next byte
 *	0 or 1	the reference is complete and its value is in FZ2.cref_cnt;
 *		the result is 1 if the terminating byte is ';' and so is
 *		part of the reference.  The callers advance their input
 *		pointer by this result. */
static int
adv_cref(DCC_GOT_CKS *cks, u_char c)
{
    switch (FZ2.cref_st) {
    case DCC_CREF_ST_IDLE:
	dcc_logbad(EX_SOFTWARE, "impossible fuz2 cref state");
	break;

    case DCC_CREF_ST_START:
	/* start to get a possible HTML character reference
	 * We have already passed the '&' */
	if (c == '#') {
	    FZ2.cref_st = DCC_CREF_ST_NUM;
	} else if ((c >= 'a' && c <= 'z')
		   || (c >= 'A' && c <= 'Z')) {
	    DCC_FUZ2_WORD_CLEAR(&FZ2.cref_w);
	    FZ2.cref_w.b[0] = c;
	    FZ2.cref_cnt = 1;
	    FZ2.cref_st = DCC_CREF_ST_NAME;
	} else {
	    FZ2.cref_st = DCC_CREF_ST_IDLE;
	    return -2;
	}
	break;

    case DCC_CREF_ST_NUM:
	/* look for 'x' or the first digit */
	if (c == 'x' || c == 'X') {
	    FZ2.cref_st = DCC_CREF_ST_HEX;
	    FZ2.cref_cnt = 0;
	} else if (c >= '0' && c <= '9') {
	    FZ2.cref_cnt = c - '0';
	    FZ2.cref_st = DCC_CREF_ST_DEC;
	} else {
	    FZ2.cref_st = DCC_CREF_ST_IDLE;
	    return -2;
	}
	break;

    case DCC_CREF_ST_DEC:
	if (c >= '0' && c <= '9') {
	    FZ2.cref_cnt = FZ2.cref_cnt*10 + (c - '0');
	} else if (cks->mime_cset[c] == FC_SP) {
	    FZ2.cref_st = DCC_CREF_ST_IDLE;
	    return c == ';';
	} else {
	    FZ2.cref_st = DCC_CREF_ST_IDLE;
	    return -2;
	}
	/* clamping after every digit keeps long digit strings
	 * from overflowing the accumulator */
	if (FZ2.cref_cnt > 256)
	    FZ2.cref_cnt = 256;
	++FZ2.btotal;
	break;

    case DCC_CREF_ST_HEX:
	if ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) {
	    /* the low 4 bits of 'a'..'f' and 'A'..'F' are 1..6 */
	    FZ2.cref_cnt = (FZ2.cref_cnt<<4) + (c & 0xf) + 9;
	} else if (c >= '0' && c <= '9') {
	    FZ2.cref_cnt = (FZ2.cref_cnt<<4) + (c - '0');
	} else if (cks->mime_cset[c] == FC_SP) {
	    FZ2.cref_st = DCC_CREF_ST_IDLE;
	    return c == ';';
	} else {
	    FZ2.cref_st = DCC_CREF_ST_IDLE;
	    return -2;
	}
	if (FZ2.cref_cnt > 256)
	    FZ2.cref_cnt = 256;
	++FZ2.btotal;
	break;

    case DCC_CREF_ST_NAME:
	if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
	    /* If the word is too long, the final match will fail.
	     * This will consume it */
	    if (FZ2.cref_cnt < sizeof(DCC_FUZ2_WORD))
		FZ2.cref_w.b[FZ2.cref_cnt++] = c;
	} else if (cks->mime_cset[c] == FC_SP) {
	    /* this character ends the cref;
	     * cref_cnt changes from the name length to the value */
	    FZ2.cref_cnt = lookup_cref(&FZ2.cref_w, FZ2.cref_cnt);
	    FZ2.cref_st = DCC_CREF_ST_IDLE;
	    return c == ';';
	} else {
	    FZ2.cref_st = DCC_CREF_ST_IDLE;
	    return -2;
	}
	break;
    }
    return -1;
}


/* Feed a chunk of message body to the fuzzy #2 checksum.
 * The scanner state lives in cks->fuz2, so words, tags and character
 * references may be split across calls.
 *	cks	per-message checksum state set up by dcc_ck_fuz2_init()
 *	bp	the bytes of the chunk
 *	bp_len	number of bytes at bp
 * Nothing is done if sums[DCC_CK_FUZ2].type is not DCC_CK_FUZ2. */
void
dcc_ck_fuz2(DCC_GOT_CKS *cks, const char *bp, u_int bp_len)
{
/* make the current word too long to be added by add_word() */
#define SKIP_WORD() (FZ2.wlen = sizeof(DCC_FUZ2_WORD)+1)
/* discard the current word and go back to gathering words */
#define JUNK() (SKIP_WORD(), FZ2.st = DCC_FUZ2_ST_WORD)
    LBUF *lbp, lbufs[FUZ2_LAN_NUM];
    FUZ2_LANG *lp;
    int i;
    char *p;
    u_char c;

    if (cks->sums[DCC_CK_FUZ2].type != DCC_CK_FUZ2)
	return;

    for (lbp = lbufs; lbp <= LAST(lbufs); ++lbp)
	lbp->blen = 0;

    while (bp_len != 0) {
	switch (FZ2.st) {
	case DCC_FUZ2_ST_WORD:
	    /* gathering a word */
	    do {
		if (FZ2.cref_st == DCC_CREF_ST_IDLE) {
		    --bp_len;
		    c = *bp++;
		    /* from here on c is a character class or
		     * a mapped letter instead of the raw byte */
		    c = cks->mime_cset[c];
		    if (c == FC_CF) {
			if (cks->mime_ct == DCC_CK_CT_HTML){
			    FZ2.cref_st = DCC_CREF_ST_START;
			    break;
			}
			/* outside HTML it only separates words */
			c = FC_SP;
		    }
		} else {
		    i = adv_cref(cks, *bp);
		    if (i == -2) {
			JUNK();		/* bogus cref */
			continue;
		    }
		    if (i < 0) {	/* get more of cref */
			--bp_len;
			++bp;
			continue;
		    }
		    bp += i;		/* use complete cref */
		    bp_len -= i;
		    /* NOTE(review): adv_cref() clamps numeric
		     * references to 256, so this index can be 256.
		     * Confirm that the character set tables have
		     * at least 257 entries. */
		    c = cks->mime_cset[FZ2.cref_cnt];

		    if (c == FC_SK)
			continue;	/* ignore accent mark */
		}

		if (c >= FC_A) {
		    ++FZ2.btotal;
		    if (FZ2.wlen < sizeof(DCC_FUZ2_WORD))
			FZ2.w.b[FZ2.wlen++] = c;
		    else
			SKIP_WORD();
		    continue;
		}

		if (c == FC_SP) {
		    /* the end of a word */
		    if (FZ2.wlen >= MIN_WLEN
			&& FZ2.wlen <=sizeof(DCC_FUZ2_WORD))
			add_word(cks, lbufs);
		    FZ2.wlen = 0;
		    DCC_FUZ2_WORD_CLEAR(&FZ2.w);
		    continue;
		}
		++FZ2.btotal;

		if (c == FC_LT) {
		    FZ2.tag_len = 0;
		    DCC_FUZ2_WORD_CLEAR(&FZ2.tag);
		    FZ2.st = DCC_FUZ2_ST_START_TAG;
		    break;
		}

		JUNK();
	    } while (bp_len != 0);
	    break;

	case DCC_FUZ2_ST_START_TAG:
	    /* collecting an HTML tag or comment
	     * We've passed the '<'
	     * The byte is only peeked at.  It is consumed here if it
	     * extends the tag name and otherwise left for the state
	     * chosen below. */
	    c = *bp;
#define SAVE_TAG(_c) (FZ2.tag.b[FZ2.tag_len++] = _c, \
		      ++FZ2.btotal, ++bp, --bp_len)
	    if (((c >= 'a' && c <= 'z')	/* tag */
		 || (c >= '0' && c <= '9'))
		&& FZ2.tag_len < sizeof(FZ2.tag)) {
		SAVE_TAG(c);
		break;
	    }
	    if (c >= 'A' && c <= 'Z'
		&& FZ2.tag_len < sizeof(FZ2.tag)) {
		SAVE_TAG(c - ('A'-'a'));	/* save as lower case */
		break;
	    }
	    if ((c == '/'		/* end-tag */
		 || c == '!')		/* start of comment */
		&& FZ2.tag_len == 0) {
		SAVE_TAG(c);
		break;
	    }
	    if (c == '-'		/* comment */
		&& FZ2.tag_len >= 1 && FZ2.tag_len <= 2) {
		SAVE_TAG(c);
		break;
	    }
#undef SAVE_TAG

	    /* notice an <html> tag while in text/plain
	     * and switch to text/html */
	    if (FZ2.tag_len == 4
		&& cks->mime_ct != DCC_CK_CT_HTML
		&& !memcmp(FZ2.tag.b, "html", 4))
		cks->mime_ct = DCC_CK_CT_HTML;

	    if (cks->mime_ct == DCC_CK_CT_HTML
		&& FZ2.tag_len > 0) {
		/* if we are in an HTML document and we
		 * have at least one character after '<',
		 * assume it is some kind of HTML tag */
		FZ2.xsummed += FZ2.tag_len+1;	/* count '<' */
		if (c == '>') {
		    /* optimize common simple tags */
		    ++FZ2.xsummed;
		    ++FZ2.btotal;
		    ++bp, --bp_len;
		    FZ2.st = DCC_FUZ2_ST_WORD;
		    break;
		}
		if (FZ2.tag_len >= 3
		    && !memcmp(FZ2.tag.b, "!--", 3)) {
		    FZ2.st = DCC_FUZ2_ST_SKIP_COMMENT;
		} else {
		    FZ2.url.st = DCC_URL_ST_IDLE;
		    FZ2.st = DCC_FUZ2_ST_SKIP_TAG;
		}
	    } else {
		/* assume it is not an HTML tag and
		 * mark the whole word as junk */
		JUNK();
	    }
	    break;

	case DCC_FUZ2_ST_SKIP_TAG:
	    /* Skip rest of boring HTML tag
	     * We ought to ignore '>' in quotes */
	    do {
		if (FZ2.cref_st == DCC_CREF_ST_IDLE) {
		    --bp_len;
		    c = *bp++;
		    /* NOTE(review): this compares the raw byte with
		     * FC_CF while the word state compares the
		     * mime_cset[] class.  Confirm that FC_CF has
		     * the same value as '&'. */
		    if (c == FC_CF) {
			FZ2.cref_st = DCC_CREF_ST_START;
			continue;
		    }
		} else {
		    i = adv_cref(cks, *bp);
		    if (i == -2)
			continue;	/* rescan the byte as itself */
		    if (i < 0) {	/* get more of cref */
			--bp_len;
			++bp;
			continue;
		    }
		    bp += i;		/* use complete cref */
		    bp_len -= i;
		    c = FZ2.cref_cnt;
		}

		/* capture URLs
		 * The result of dcc_ck_url() is an action in the
		 * bits of DCC_CK_URL_MASK and a character above
		 * DCC_CK_URL_SHIFT. */
		i = dcc_ck_url(&FZ2.url, c, &FZ2.url_cp);
		c = i>>DCC_CK_URL_SHIFT;
		switch ((DCC_CK_URL)(i & DCC_CK_URL_MASK)) {
		case DCC_CK_URL_CHAR:
		    break;
		case DCC_CK_URL_CK_LEN:
		    /* Make room before starting a URL
		     * if we are too close to
		     * end of buffer for a maximum size URL.
		     * Discard the first URL in the buffer.
		     * This relies on dcc_ck_url() limiting
		     * the URL to DCC_URL_MAX bytes */
		    /* NOTE(review): the limit tested here is
		     * DCC_FUZ2_URL_MAX while the comment above and
		     * the fallback below use DCC_URL_MAX.  Confirm
		     * that the two constants are meant to differ. */
		    while (FZ2.url_cp >= &FZ2.url_buf[ISZ(FZ2.url_buf)
						      - DCC_FUZ2_URL_MAX]) {
			p = memchr(FZ2.url_buf, '\0',
				   FZ2.url_cp-FZ2.url_buf);
			if (!p) {
			    /* if this was the first URL,
			     * discard half of it */
			    p = &FZ2.url_buf[DCC_URL_MAX/2];
			} else {
			    ++p;
			}
			memmove(FZ2.url_buf, p,
				FZ2.url_cp - p);
			FZ2.url_cp -= p - FZ2.url_buf;
		    }
		    /* a null byte separates URLs in the buffer */
		    if (FZ2.url_cp != FZ2.url_buf)
			*FZ2.url_cp++ = '\0';
		    ++FZ2.urls;
		    break;
		case DCC_CK_URL_HOST:
		case DCC_CK_URL_DOT:
		    if (FZ2.url_cp < &FZ2.url_buf[ISZ(FZ2.url_buf)])
			*FZ2.url_cp++ = c;
		    continue;
		case DCC_CK_URL_HOST_END:
		case DCC_CK_URL_HOST_RESET:
		case DCC_CK_URL_SKIP:
		    continue;
		}

		if (c == '>') {
		    ++FZ2.xsummed;
		    ++FZ2.btotal;
		    FZ2.st = DCC_FUZ2_ST_WORD;
		    break;
		}
		if (cks->mime_cset[c] != FC_SP) {
		    ++FZ2.xsummed;
		    ++FZ2.btotal;
		    /* don't let wild tags run forever */
		    if (++FZ2.tag_len > DCC_URL_FAILSAFE) {
			JUNK();
			break;
		    }
		}
	    } while (bp_len != 0);
	    break;

	case DCC_FUZ2_ST_SKIP_COMMENT:
	    /* HTML comments can include HTML tags,
	     * but spammers don't understand HTML comment syntax
	     * and Netscape and IE treat (and ignore) broken
	     * comments like strange tags.
	     * So the first '>' ends the comment. */
	    do {
		--bp_len;
		c = *bp++;
		if (c == '>') {
		    ++FZ2.xsummed;
		    ++FZ2.btotal;
		    FZ2.st = DCC_FUZ2_ST_WORD;
		    break;
		}
		if (cks->mime_cset[c] != FC_SP) {
		    ++FZ2.xsummed;
		    ++FZ2.btotal;
		    /* don't let wild tags run forever */
		    if (++FZ2.tag_len > DCC_URL_FAILSAFE) {
			JUNK();
			break;
		    }
		}
	    } while (bp_len != 0);
	    break;
	}
    }

    /* The buffers do not survive this call,
     * so add the words still in them to the checksums. */
    for (lbp = lbufs, lp = FZ2.lang; lbp <= LAST(lbufs); ++lbp, ++lp) {
	if (lbp->blen != 0) {
	    lp->wsummed += lbp->blen;
	    FUZ2(lp, lbp->buf, lbp->blen);
	}
    }

#undef SKIP_WORD
#undef JUNK
#undef BUF_LEN
}


/* Finish the fuzzy #2 checksum of a message.
 *	cks	per-message checksum state
 * On return sums[DCC_CK_FUZ2] is one of
 *	- type DCC_CK_INVALID because there is too little text to checksum,
 *	- an all-zero sum without rpt2srvr being set, or
 *	- the MD5 of the language with the most recognized words, with
 *	  rpt2srvr set.
 * Nothing is done if sums[DCC_CK_FUZ2].type is not DCC_CK_FUZ2. */
void
dcc_ck_fuz2_fin(DCC_GOT_CKS *cks)
{
    FUZ2_LANG *lp, *lp1;

    if (cks->sums[DCC_CK_FUZ2].type != DCC_CK_FUZ2)
	return;

    /* pick the language checksum of the most words;
     * the earlier language wins ties */
    lp = FZ2.lang;
    for (lp1 = lp+1; lp1 <= LAST(FZ2.lang); ++lp1) {
	if (lp->wtotal < lp1->wtotal)
	    lp = lp1;
    }

#ifdef DCC_DEBUG_CKSUM
    if (dcc_clnt_debug > 3)
	printf("\n***fuz2: wtotal[%d]=%d summed=%d+%d btotal=%d\n",
	       (int)(lp-FZ2.lang),
	       lp->wtotal, lp->wsummed, FZ2.xsummed, FZ2.btotal);
#endif
    /* The FUZ2 checksum is not valid if it is on a few words and
     * less than 10% of a big, binary file */
    if (lp->wtotal < 100
	&& (lp->wsummed+FZ2.xsummed)*10 < FZ2.btotal) {
	cks->sums[DCC_CK_FUZ2].type = DCC_CK_INVALID;
	return;
    }

    /* We cannot compute a checksum on a nearly empty message */
    if (lp->wtotal < 8) {
	if (lp->wtotal + FZ2.urls*4 >= 8) {
	    /* use URLs if we lack words */
	    FUZ2(lp, FZ2.url_buf, FZ2.url_cp - FZ2.url_buf);
	} else {
	    /* Compute a checksum for local blacklisting on messages
	     * that look empty to the FUZ2 checksum but are not and
	     * are not too binary.  The resulting checksum is zero.
	     * Do not report it to the DCC server. */
	    if ((lp->wsummed+FZ2.xsummed) >= 120) {
		memset(cks->sums[DCC_CK_FUZ2].sum, 0,
		       sizeof(cks->sums[DCC_CK_FUZ2].sum));
		return;
	    }
	    cks->sums[DCC_CK_FUZ2].type = DCC_CK_INVALID;
	    return;
	}
    }

    MD5Final(cks->sums[DCC_CK_FUZ2].sum, &lp->md5);
    cks->sums[DCC_CK_FUZ2].rpt2srvr = 1;
}