Mercurial > notdcc
diff dcclib/ckfuz2.c @ 0:c7f6b056b673
First import of vendor version
author | Peter Gervai <grin@grin.hu> |
---|---|
date | Tue, 10 Mar 2009 13:49:58 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dcclib/ckfuz2.c Tue Mar 10 13:49:58 2009 +0100 @@ -0,0 +1,590 @@ +/* Distributed Checksum Clearinghouse + * + * compute fuzzy body checksum #2 + * + * Copyright (c) 2008 by Rhyolite Software, LLC + * + * This agreement is not applicable to any entity which sells anti-spam + * solutions to others or provides an anti-spam solution as part of a + * security solution sold to other entities, or to a private network + * which employs the DCC or uses data provided by operation of the DCC + * but does not provide corresponding data to other users. + * + * Permission to use, copy, modify, and distribute this software without + * changes for any purpose with or without fee is hereby granted, provided + * that the above copyright notice and this permission notice appear in all + * copies and any distributed versions or copies are either unchanged + * or not called anything similar to "DCC" or "Distributed Checksum + * Clearinghouse". + * + * Parties not eligible to receive a license under this agreement can + * obtain a commercial license to use DCC by contacting Rhyolite Software + * at sales@rhyolite.com. + * + * A commercial license would be for Distributed Checksum and Reputation + * Clearinghouse software. That software includes additional features. This + * free license for Distributed ChecksumClearinghouse Software does not in any + * way grant permision to use Distributed Checksum and Reputation Clearinghouse + * software + * + * THE SOFTWARE IS PROVIDED "AS IS" AND RHYOLITE SOFTWARE, LLC DISCLAIMS ALL + * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL RHYOLITE SOFTWARE, LLC + * BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES + * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, + * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Rhyolite Software DCC 1.3.103-1.52 $Revision$ + */ + +#include "dcc_ck.h" + +#include "ckfuz2_tbl.h" +struct { + const char **words; + u_int len; + const u_char *cset; +} tbls[FUZ2_LAN_NUM] = { + {word_tbl0, word_tbl0_LEN, 0}, + {word_tbl1, word_tbl1_LEN, 0}, + {word_tbl2, word_tbl2_LEN, dcc_cset_2}, +}; + + +#define FZ2 cks->fuz2 + +#define BUF_LEN 1024 +typedef struct { + char buf[BUF_LEN+sizeof(DCC_FUZ2_WORD)+1]; + int blen; +} LBUF; + + +#ifdef DCC_DEBUG_CKSUM +#define FUZ2(lp, b, l) (dcc_clnt_debug == 5 \ + ? (write(1, b, l), MD5Update(&lp->md5, b, l)) \ + : MD5Update(&lp->md5, b, l)) +#else +#define FUZ2(lp, b, l) MD5Update(&lp->md5, b, l) +#endif + + +void +dcc_ck_fuz2_init(DCC_GOT_CKS *cks) +{ + FUZ2_LANG *lp; + + FZ2.wlen = 0; + DCC_FUZ2_WORD_CLEAR(&FZ2.w); + FZ2.st = DCC_FUZ2_ST_WORD; + FZ2.url_cp = FZ2.url_buf; + FZ2.urls = 0; + + cks->sums[DCC_CK_FUZ2].type = DCC_CK_FUZ2; + + FZ2.btotal = 0; + FZ2.xsummed = 0; + for (lp = FZ2.lang; lp <= LAST(FZ2.lang); ++lp) { + lp->wsummed = 0; + lp->wtotal = 0; + MD5Init(&lp->md5); + } +} + + + +static inline u_char /* 1=found it, 0=not a known word */ +lookup_word(const DCC_FUZ2_WORD *w, u_int wlen, + const char **word_tbl, u_int word_tbl_len) +{ + const char *p; + u_int n; + + p = word_tbl[fuz2_word_hash(w, word_tbl_len)]; + if (!p) + return 0; + for (;;) { + n = *p++; + if (!n) + return 0; + if (n == wlen && !memcmp(w->b, p, n)) + return 1; + p += n; + } +} + + + +static void +add_word(DCC_GOT_CKS *cks, LBUF *lbp) +{ + FUZ2_LANG *lp; + int tbl; + + tbl = 0; + for (lp = FZ2.lang; lp < &FZ2.lang[FUZ2_LAN_NUM]; ++lp, ++lbp, ++tbl) { + if (tbls[tbl].cset != 0 + && tbls[tbl].cset != cks->mime_cset) + continue; + if (lookup_word(&FZ2.w, FZ2.wlen, + tbls[tbl].words, tbls[tbl].len)) { + ++lp->wtotal; + memcpy(&lbp->buf[lbp->blen], &FZ2.w, FZ2.wlen); + if ((lbp->blen += FZ2.wlen) >= BUF_LEN) { + lp->wsummed += lbp->blen; + FUZ2(lp, lbp->buf, lbp->blen); + lbp->blen = 0; + } + } + } +} + + + +/* resolve an HTML characater reference */ +static u_char /* 0 or found value */ +lookup_cref(DCC_FUZ2_WORD *w, u_int clen) +{ + const char *p; + u_int n; + + if (clen > sizeof(DCC_FUZ2_WORD) + || clen == 0) + return 0; + p = cref_tbl[fuz2_word_hash(w, cref_tbl_LEN)]; + if (!p) + return 0; + for (;;) { + n = *p++; + if (!n) + return 0; + if (n == clen && !memcmp(w->b, p, n)) + return p[clen]; + p += n+1; + } +} + + + +static int +adv_cref(DCC_GOT_CKS *cks, u_char c) +{ + switch (FZ2.cref_st) { + case DCC_CREF_ST_IDLE: + dcc_logbad(EX_SOFTWARE, "impossible fuz2 cref state"); + break; + + case DCC_CREF_ST_START: + /* start to get a possible HTML character reference + * We have already passed the '&' */ + if (c == '#') { + FZ2.cref_st = DCC_CREF_ST_NUM; + } else if ((c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z')) { + DCC_FUZ2_WORD_CLEAR(&FZ2.cref_w); + FZ2.cref_w.b[0] = c; + FZ2.cref_cnt = 1; + FZ2.cref_st = DCC_CREF_ST_NAME; + } else { + FZ2.cref_st = DCC_CREF_ST_IDLE; + return -2; + } + break; + + case DCC_CREF_ST_NUM: + /* look for 'x' or the first digit */ + if (c == 'x' || c == 'X') { + FZ2.cref_st = DCC_CREF_ST_HEX; + FZ2.cref_cnt = 0; + } else if (c >= '0' && c <= '9') { + FZ2.cref_cnt = c - '0'; + FZ2.cref_st = DCC_CREF_ST_DEC; + } else { + FZ2.cref_st = DCC_CREF_ST_IDLE; + return -2; + } + break; + + case DCC_CREF_ST_DEC: + if (c >= '0' && c <= '9') { + FZ2.cref_cnt = FZ2.cref_cnt*10 + (c - '0'); + } else if (cks->mime_cset[c] == FC_SP) { + FZ2.cref_st = DCC_CREF_ST_IDLE; + return c == ';'; + } else { + FZ2.cref_st = DCC_CREF_ST_IDLE; + return -2; + } + if (FZ2.cref_cnt > 256) + FZ2.cref_cnt = 256; + ++FZ2.btotal; + break; + + case DCC_CREF_ST_HEX: + if ((c >= 'a' && c <= 'f') + || (c >= 'A' && c <= 'F')) { + FZ2.cref_cnt = (FZ2.cref_cnt<<4) + (c & 0xf) + 9; + } else if (c >= '0' && c <= '9') { + FZ2.cref_cnt = (FZ2.cref_cnt<<4) + (c - '0'); + } else if (cks->mime_cset[c] == FC_SP) { + FZ2.cref_st = DCC_CREF_ST_IDLE; + return c == ';'; + } else { + FZ2.cref_st = DCC_CREF_ST_IDLE; + return -2; + } + if (FZ2.cref_cnt > 256) + FZ2.cref_cnt = 256; + ++FZ2.btotal; + break; + + case DCC_CREF_ST_NAME: + if ((c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z')) { + /* If the word is too long, the final match will fail. + * This will consume it */ + if (FZ2.cref_cnt < sizeof(DCC_FUZ2_WORD)) + FZ2.cref_w.b[FZ2.cref_cnt++] = c; + } else if (cks->mime_cset[c] == FC_SP) { + /* this character ends the cref */ + FZ2.cref_cnt = lookup_cref(&FZ2.cref_w, FZ2.cref_cnt); + FZ2.cref_st = DCC_CREF_ST_IDLE; + return c == ';'; + } else { + FZ2.cref_st = DCC_CREF_ST_IDLE; + return -2; + } + break; + } + return -1; +} + + +void +dcc_ck_fuz2(DCC_GOT_CKS *cks, const char *bp, u_int bp_len) +{ +#define SKIP_WORD() (FZ2.wlen = sizeof(DCC_FUZ2_WORD)+1) +#define JUNK() (SKIP_WORD(), FZ2.st = DCC_FUZ2_ST_WORD) + LBUF *lbp, lbufs[FUZ2_LAN_NUM]; + FUZ2_LANG *lp; + int i; + char *p; + u_char c; + + if (cks->sums[DCC_CK_FUZ2].type != DCC_CK_FUZ2) + return; + + for (lbp = lbufs; lbp <= LAST(lbufs); ++lbp) + lbp->blen = 0; + + while (bp_len != 0) { + switch (FZ2.st) { + case DCC_FUZ2_ST_WORD: + /* gathering a word */ + do { + if (FZ2.cref_st == DCC_CREF_ST_IDLE) { + --bp_len; + c = *bp++; + c = cks->mime_cset[c]; + if (c == FC_CF) { + if (cks->mime_ct == DCC_CK_CT_HTML){ + FZ2.cref_st = DCC_CREF_ST_START; + break; + } + c = FC_SP; + } + } else { + i = adv_cref(cks, *bp); + if (i == -2) { + JUNK(); /* bogus cref */ + continue; + } + if (i < 0) { /* get more of cref */ + --bp_len; + ++bp; + continue; + } + bp += i; /* use complete cref */ + bp_len -= i; + c = cks->mime_cset[FZ2.cref_cnt]; + + if (c == FC_SK) + continue; /* ignore accent mark */ + } + + if (c >= FC_A) { + ++FZ2.btotal; + if (FZ2.wlen < sizeof(DCC_FUZ2_WORD)) + FZ2.w.b[FZ2.wlen++] = c; + else + SKIP_WORD(); + continue; + } + + if (c == FC_SP) { + if (FZ2.wlen >= MIN_WLEN + && FZ2.wlen <=sizeof(DCC_FUZ2_WORD)) + add_word(cks, lbufs); + FZ2.wlen = 0; + DCC_FUZ2_WORD_CLEAR(&FZ2.w); + continue; + } + ++FZ2.btotal; + + if (c == FC_LT) { + FZ2.tag_len = 0; + DCC_FUZ2_WORD_CLEAR(&FZ2.tag); + FZ2.st = DCC_FUZ2_ST_START_TAG; + break; + } + + JUNK(); + } while (bp_len != 0); + break; + + case DCC_FUZ2_ST_START_TAG: + /* collecting an HTML tag or comment + * We've passed the '<' */ + c = *bp; +#define SAVE_TAG(_c) (FZ2.tag.b[FZ2.tag_len++] = _c, \ + ++FZ2.btotal, ++bp, --bp_len) + if (((c >= 'a' && c <= 'z') /* tag */ + || (c >= '0' && c <= '9')) + && FZ2.tag_len < sizeof(FZ2.tag)) { + SAVE_TAG(c); + break; + } + if (c >= 'A' && c <= 'Z' + && FZ2.tag_len < sizeof(FZ2.tag)) { + SAVE_TAG(c - ('A'-'a')); + break; + } + if ((c == '/' /* end-tag */ + || c == '!') /* start of comment */ + && FZ2.tag_len == 0) { + SAVE_TAG(c); + break; + } + if (c == '-' /* comment */ + && FZ2.tag_len >= 1 && FZ2.tag_len <= 2) { + SAVE_TAG(c); + break; + } +#undef SAVE_TAG + + /* notice an <html> tag while in text/plain + * and switch to text/html */ + if (FZ2.tag_len == 4 + && cks->mime_ct != DCC_CK_CT_HTML + && !memcmp(FZ2.tag.b, "html", 4)) + cks->mime_ct = DCC_CK_CT_HTML; + + if (cks->mime_ct == DCC_CK_CT_HTML + && FZ2.tag_len > 0) { + /* if we are in an HTML document and we + * have at least one character after '<', + * assume it is some kind of HTML tag */ + FZ2.xsummed += FZ2.tag_len+1; /* count '<' */ + if (c == '>') { + /* optimize common simple tags */ + ++FZ2.xsummed; + ++FZ2.btotal; + ++bp, --bp_len; + FZ2.st = DCC_FUZ2_ST_WORD; + break; + } + if (FZ2.tag_len >= 3 + && !memcmp(FZ2.tag.b, "!--", 3)) { + FZ2.st = DCC_FUZ2_ST_SKIP_COMMENT; + } else { + FZ2.url.st = DCC_URL_ST_IDLE; + FZ2.st = DCC_FUZ2_ST_SKIP_TAG; + } + } else { + /* assume it is not an HTML tag and + * mark the whole word as junk */ + JUNK(); + } + break; + + case DCC_FUZ2_ST_SKIP_TAG: + /* Skip rest of boring HTML tag + * We ought to ignore '>' in quotes */ + do { + if (FZ2.cref_st == DCC_CREF_ST_IDLE) { + --bp_len; + c = *bp++; + if (c == FC_CF) { + FZ2.cref_st = DCC_CREF_ST_START; + continue; + } + } else { + i = adv_cref(cks, *bp); + if (i == -2) + continue; + if (i < 0) { /* get more of cref */ + --bp_len; + ++bp; + continue; + } + bp += i; /* use complete cref */ + bp_len -= i; + c = FZ2.cref_cnt; + } + + /* capture URLs */ + i = dcc_ck_url(&FZ2.url, c, &FZ2.url_cp); + c = i>>DCC_CK_URL_SHIFT; + switch ((DCC_CK_URL)(i & DCC_CK_URL_MASK)) { + case DCC_CK_URL_CHAR: + break; + case DCC_CK_URL_CK_LEN: + /* Make room before starting a URL + * if we are too close to + * end of buffer for a maximum size URL. + * Discard the first URL in the buffer. + * This relies on dcc_ck_url() limiting + * the URL to DCC_URL_MAX bytes */ + while (FZ2.url_cp + >= &FZ2.url_buf[ISZ(FZ2.url_buf) + - DCC_FUZ2_URL_MAX]) { + p = memchr(FZ2.url_buf, '\0', + FZ2.url_cp-FZ2.url_buf); + if (!p) { + /* if this was the first URL, + * discard half of it */ + p = &FZ2.url_buf[DCC_URL_MAX/2]; + } else { + ++p; + } + memmove(FZ2.url_buf, p, + FZ2.url_cp - p); + FZ2.url_cp -= p - FZ2.url_buf; + } + if (FZ2.url_cp != FZ2.url_buf) + *FZ2.url_cp++ = '\0'; + ++FZ2.urls; + break; + case DCC_CK_URL_HOST: + case DCC_CK_URL_DOT: + if (FZ2.url_cp + < &FZ2.url_buf[ISZ(FZ2.url_buf)]) + *FZ2.url_cp++ = c; + continue; + case DCC_CK_URL_HOST_END: + case DCC_CK_URL_HOST_RESET: + case DCC_CK_URL_SKIP: + continue; + } + + if (c == '>') { + ++FZ2.xsummed; + ++FZ2.btotal; + FZ2.st = DCC_FUZ2_ST_WORD; + break; + } + if (cks->mime_cset[c] != FC_SP) { + ++FZ2.xsummed; + ++FZ2.btotal; + /* don't let wild tags run forever */ + if (++FZ2.tag_len > DCC_URL_FAILSAFE) { + JUNK(); + break; + } + } + } while (bp_len != 0); + break; + + case DCC_FUZ2_ST_SKIP_COMMENT: + /* HTML comments can include HTML tags, + * but spammers don't understand HTML comment syntax + * and Netscape and IE treat (and ignore) broken + * comments like strange tags. */ + do { + --bp_len; + c = *bp++; + if (c == '>') { + ++FZ2.xsummed; + ++FZ2.btotal; + FZ2.st = DCC_FUZ2_ST_WORD; + break; + } + if (cks->mime_cset[c] != FC_SP) { + ++FZ2.xsummed; + ++FZ2.btotal; + /* don't let wild tags run forever */ + if (++FZ2.tag_len > DCC_URL_FAILSAFE) { + JUNK(); + break; + } + } + } while (bp_len != 0); + break; + } + } + for (lbp = lbufs, lp = FZ2.lang; lbp <= LAST(lbufs); ++lbp, ++lp) { + if (lbp->blen != 0) { + lp->wsummed += lbp->blen; + FUZ2(lp, lbp->buf, lbp->blen); + } + } +#undef SKIP_WORD +#undef JUNK +#undef BUF_LEN +} + + + +void +dcc_ck_fuz2_fin(DCC_GOT_CKS *cks) +{ + FUZ2_LANG *lp, *lp1; + + if (cks->sums[DCC_CK_FUZ2].type != DCC_CK_FUZ2) + return; + + /* pick the language checksum of the most words */ + lp = FZ2.lang; + for (lp1 = lp+1; lp1 <= LAST(FZ2.lang); ++lp1) { + if (lp->wtotal < lp1->wtotal) + lp = lp1; + } + +#ifdef DCC_DEBUG_CKSUM + if (dcc_clnt_debug > 3) + printf("\n***fuz2: wtotal[%d]=%d summed=%d+%d btotal=%d\n", + (int)(lp-FZ2.lang), + lp->wtotal, lp->wsummed, FZ2.xsummed, FZ2.btotal); +#endif + /* The FUZ2 checksum is not valid if it is on a few words and + * less than 10% of a big, binary file */ + if (lp->wtotal < 100 + && (lp->wsummed+FZ2.xsummed)*10 < FZ2.btotal) { + cks->sums[DCC_CK_FUZ2].type = DCC_CK_INVALID; + return; + } + /* We cannot compute a checksum on a nearly empty message */ + if (lp->wtotal < 8) { + if (lp->wtotal + FZ2.urls*4 >= 8) { + /* use URLs if we lack words */ + FUZ2(lp, FZ2.url_buf, FZ2.url_cp - FZ2.url_buf); + } else { + /* Compute a checksum for local blacklising on messages + * that look empty to the FUZ2 checksum but are not and + * are not too binary. The resulting checksum is zero. + * Do not report it to the DCC server. */ + if ((lp->wsummed+FZ2.xsummed) >= 120) { + memset(cks->sums[DCC_CK_FUZ2].sum, 0, + sizeof(cks->sums[DCC_CK_FUZ2].sum)); + return; + } + cks->sums[DCC_CK_FUZ2].type = DCC_CK_INVALID; + return; + } + } + + MD5Final(cks->sums[DCC_CK_FUZ2].sum, &lp->md5); + cks->sums[DCC_CK_FUZ2].rpt2srvr = 1; +}