view dcclib/ckfuz2.c @ 0:c7f6b056b673

First import of vendor version
author Peter Gervai <grin@grin.hu>
date Tue, 10 Mar 2009 13:49:58 +0100
parents
children
line wrap: on
line source

/* Distributed Checksum Clearinghouse
 *
 * compute fuzzy body checksum #2
 *
 * Copyright (c) 2008 by Rhyolite Software, LLC
 *
 * This agreement is not applicable to any entity which sells anti-spam
 * solutions to others or provides an anti-spam solution as part of a
 * security solution sold to other entities, or to a private network
 * which employs the DCC or uses data provided by operation of the DCC
 * but does not provide corresponding data to other users.
 *
 * Permission to use, copy, modify, and distribute this software without
 * changes for any purpose with or without fee is hereby granted, provided
 * that the above copyright notice and this permission notice appear in all
 * copies and any distributed versions or copies are either unchanged
 * or not called anything similar to "DCC" or "Distributed Checksum
 * Clearinghouse".
 *
 * Parties not eligible to receive a license under this agreement can
 * obtain a commercial license to use DCC by contacting Rhyolite Software
 * at sales@rhyolite.com.
 *
 * A commercial license would be for Distributed Checksum and Reputation
 * Clearinghouse software.  That software includes additional features.  This
 * free license for Distributed ChecksumClearinghouse Software does not in any
 * way grant permision to use Distributed Checksum and Reputation Clearinghouse
 * software
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND RHYOLITE SOFTWARE, LLC DISCLAIMS ALL
 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL RHYOLITE SOFTWARE, LLC
 * BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
 * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Rhyolite Software DCC 1.3.103-1.52 $Revision$
 */

#include "dcc_ck.h"

#include "ckfuz2_tbl.h"
/* Dictionaries of known words, one per FUZ2 language.
 * add_word() walks this array in step with FZ2.lang[], so entry i feeds
 * the checksum of language i. */
struct {
    const char **words;		/* hash table searched by lookup_word() */
    u_int	len;		/* table size given to fuz2_word_hash() */
    const u_char *cset;		/* 0=use with any character set, otherwise
				 * use only when cks->mime_cset is this */
} tbls[FUZ2_LAN_NUM] = {
    {word_tbl0, word_tbl0_LEN, 0},
    {word_tbl1, word_tbl1_LEN, 0},
    {word_tbl2, word_tbl2_LEN, dcc_cset_2},
};


/* Shorthand for the FUZ2 state; it only works where a variable named
 * "cks" is in scope. */
#define FZ2 cks->fuz2

/* Number of buffered word bytes that triggers handing them to MD5. */
#define BUF_LEN 1024
/* Staging buffer for the accepted words of one language. */
typedef struct {
    /* add_word() appends a word first and compares blen with BUF_LEN
     * afterwards, hence the room for one extra word past BUF_LEN. */
    char buf[BUF_LEN+sizeof(DCC_FUZ2_WORD)+1];
    int blen;			/* bytes currently held in buf */
} LBUF;


/* Add l bytes at b to the MD5 context of the language lp points to.
 * With DCC_DEBUG_CKSUM defined and dcc_clnt_debug equal to 5, the same
 * bytes are first copied to file descriptor 1.
 *
 * Every parameter is parenthesized so that an argument that is an
 * expression, such as "lp + 1", is applied as a whole; without that
 * "&lp + 1->md5" would not even parse. */
#ifdef DCC_DEBUG_CKSUM
#define FUZ2(lp, b, l) (dcc_clnt_debug == 5				\
			? (write(1, (b), (l)),				\
			   MD5Update(&(lp)->md5, (b), (l)))		\
			: MD5Update(&(lp)->md5, (b), (l)))
#else
#define FUZ2(lp, b, l) MD5Update(&(lp)->md5, (b), (l))
#endif


/* Get ready to compute the FUZ2 checksum of a new message body.
 *
 * Marks the FUZ2 slot of cks->sums as in use, puts the parser into the
 * word-gathering state with an empty word and an empty URL buffer, and
 * zeroes the byte and word counters and the MD5 context of every
 * language.
 *
 * NOTE(review): FZ2.cref_st is not assigned here although dcc_ck_fuz2()
 * tests it before anything in this file sets it; presumably the caller
 * has cleared *cks beforehand -- confirm. */
void
dcc_ck_fuz2_init(DCC_GOT_CKS *cks)
{
	FUZ2_LANG *lang;

	cks->sums[DCC_CK_FUZ2].type = DCC_CK_FUZ2;

	/* parser: start between words, outside any tag */
	FZ2.st = DCC_FUZ2_ST_WORD;
	FZ2.wlen = 0;
	DCC_FUZ2_WORD_CLEAR(&FZ2.w);

	/* no URLs captured yet */
	FZ2.urls = 0;
	FZ2.url_cp = FZ2.url_buf;

	/* counters shared by all languages */
	FZ2.xsummed = 0;
	FZ2.btotal = 0;

	/* per-language counters and checksums */
	lang = FZ2.lang;
	while (lang <= LAST(FZ2.lang)) {
		MD5Init(&lang->md5);
		lang->wtotal = 0;
		lang->wsummed = 0;
		++lang;
	}
}



/* Look for a word in one language's dictionary.
 *
 * The bucket chosen by fuz2_word_hash() is a run of entries, each a
 * length byte followed by that many characters; a zero length byte
 * ends the run.  An empty bucket is a null pointer.
 *
 * Returns 1 if an entry has length wlen and matches the first wlen
 * bytes of w->b, otherwise 0. */
static inline u_char
lookup_word(const DCC_FUZ2_WORD *w, u_int wlen,
	    const char **word_tbl, u_int word_tbl_len)
{
	const char *entry;
	u_int elen;

	entry = word_tbl[fuz2_word_hash(w, word_tbl_len)];
	if (!entry)
		return 0;

	for (elen = *entry++; elen != 0; elen = *entry++) {
		if (elen == wlen && memcmp(w->b, entry, elen) == 0)
			return 1;
		entry += elen;		/* step over this entry's text */
	}
	return 0;
}



/* Offer the word just completed in FZ2.w to every language.
 *
 * lbp points to an array of FUZ2_LAN_NUM staging buffers, one per
 * language.  For each language whose dictionary applies to the current
 * character set and contains the word, the word is counted and appended
 * to that language's buffer; once a buffer holds at least BUF_LEN bytes
 * it is fed to the language's MD5 context and emptied. */
static void
add_word(DCC_GOT_CKS *cks, LBUF *lbp)
{
	int n;

	for (n = 0; n < FUZ2_LAN_NUM; ++n) {
		FUZ2_LANG *lang = &FZ2.lang[n];
		LBUF *out = &lbp[n];

		/* a dictionary tied to a character set is used only
		 * while that character set is the current one */
		if (tbls[n].cset != 0
		    && tbls[n].cset != cks->mime_cset)
			continue;

		if (!lookup_word(&FZ2.w, FZ2.wlen,
				 tbls[n].words, tbls[n].len))
			continue;

		++lang->wtotal;
		memcpy(&out->buf[out->blen], &FZ2.w, FZ2.wlen);
		out->blen += FZ2.wlen;
		if (out->blen < BUF_LEN)
			continue;

		/* buffer is full enough; checksum it and start over */
		lang->wsummed += out->blen;
		FUZ2(lang, out->buf, out->blen);
		out->blen = 0;
	}
}



/* Resolve a named HTML character reference.
 *
 * w holds the name and clen its length.  Each entry in a cref_tbl
 * bucket is a length byte, the name, and one byte with the value of
 * the reference; a zero length byte ends the bucket.
 *
 * Returns the value byte of the matching entry, or 0 if the name is
 * empty, too long for a DCC_FUZ2_WORD, or not in the table. */
static u_char
lookup_cref(DCC_FUZ2_WORD *w, u_int clen)
{
	const char *entry;
	u_int elen;

	if (clen == 0 || clen > sizeof(DCC_FUZ2_WORD))
		return 0;

	entry = cref_tbl[fuz2_word_hash(w, cref_tbl_LEN)];
	if (!entry)
		return 0;

	while ((elen = *entry++) != 0) {
		if (elen == clen && memcmp(w->b, entry, elen) == 0)
			return entry[clen];
		entry += elen+1;	/* skip the name and its value */
	}
	return 0;
}



/* Advance the HTML character reference recognizer by one byte.
 *
 * The '&' that starts the reference has already been consumed and
 * FZ2.cref_st is not DCC_CREF_ST_IDLE.  c is the next input byte.
 *
 * Returns
 *	-1	c is part of the reference; more input is needed
 *	-2	c cannot continue the reference; the recognizer is idle
 *		    again
 *	0 or 1	the reference is complete and the recognizer is idle
 *		    again.  FZ2.cref_cnt holds the number (limited to
 *		    256) of a numeric reference or the result of
 *		    lookup_cref() for a named one.  The value returned
 *		    is 1 when c is the closing ';' and 0 for any other
 *		    terminator.
 */
static int
adv_cref(DCC_GOT_CKS *cks, u_char c)
{
	const int is_letter = ((c >= 'a' && c <= 'z')
			       || (c >= 'A' && c <= 'Z'));
	const int is_digit = (c >= '0' && c <= '9');
	int took_digit = 0;

	switch (FZ2.cref_st) {
	case DCC_CREF_ST_IDLE:
		dcc_logbad(EX_SOFTWARE, "impossible fuz2 cref state");
		break;

	case DCC_CREF_ST_START:
		/* first byte after the '&' */
		if (c == '#') {
			FZ2.cref_st = DCC_CREF_ST_NUM;
			break;
		}
		if (!is_letter)
			goto bogus;
		DCC_FUZ2_WORD_CLEAR(&FZ2.cref_w);
		FZ2.cref_w.b[0] = c;
		FZ2.cref_cnt = 1;
		FZ2.cref_st = DCC_CREF_ST_NAME;
		break;

	case DCC_CREF_ST_NUM:
		/* after "&#": expect 'x' or the first decimal digit */
		if (c == 'x' || c == 'X') {
			FZ2.cref_st = DCC_CREF_ST_HEX;
			FZ2.cref_cnt = 0;
			break;
		}
		if (!is_digit)
			goto bogus;
		FZ2.cref_cnt = c - '0';
		FZ2.cref_st = DCC_CREF_ST_DEC;
		break;

	case DCC_CREF_ST_DEC:
		if (is_digit) {
			FZ2.cref_cnt = FZ2.cref_cnt*10 + (c - '0');
			took_digit = 1;
			break;
		}
		if (cks->mime_cset[c] == FC_SP)
			goto ended;
		goto bogus;

	case DCC_CREF_ST_HEX:
		if ((c >= 'a' && c <= 'f')
		    || (c >= 'A' && c <= 'F')) {
			/* low 4 bits of 'a'..'f' and 'A'..'F' are 1..6 */
			FZ2.cref_cnt = (FZ2.cref_cnt<<4) + (c & 0xf) + 9;
			took_digit = 1;
			break;
		}
		if (is_digit) {
			FZ2.cref_cnt = (FZ2.cref_cnt<<4) + (c - '0');
			took_digit = 1;
			break;
		}
		if (cks->mime_cset[c] == FC_SP)
			goto ended;
		goto bogus;

	case DCC_CREF_ST_NAME:
		if (is_letter) {
			/* Letters beyond the capacity of the word are
			 * dropped but still consumed.  The name is then
			 * too long for the final lookup to match. */
			if (FZ2.cref_cnt < sizeof(DCC_FUZ2_WORD))
				FZ2.cref_w.b[FZ2.cref_cnt++] = c;
			break;
		}
		if (cks->mime_cset[c] != FC_SP)
			goto bogus;
		/* the name is complete; replace its length by its value */
		FZ2.cref_cnt = lookup_cref(&FZ2.cref_w, FZ2.cref_cnt);
		goto ended;
	}

	if (took_digit) {
		/* keep the accumulated number within bounds and count
		 * the digit as a body byte */
		if (FZ2.cref_cnt > 256)
			FZ2.cref_cnt = 256;
		++FZ2.btotal;
	}
	return -1;

ended:
	FZ2.cref_st = DCC_CREF_ST_IDLE;
	return c == ';';

bogus:
	FZ2.cref_st = DCC_CREF_ST_IDLE;
	return -2;
}


/* Feed bp_len more bytes of message body, starting at bp, to the FUZ2
 * checksum.
 *
 * All parser state is kept in cks->fuz2, so a word, an HTML tag, a
 * comment, or a character reference may be split across calls.
 * Accepted words are staged in per-language buffers that live only for
 * the duration of the call; add_word() flushes a buffer to MD5 when it
 * fills, and whatever is left is flushed before returning.
 *
 * Does nothing unless cks->sums[DCC_CK_FUZ2].type is DCC_CK_FUZ2. */
void
dcc_ck_fuz2(DCC_GOT_CKS *cks, const char *bp, u_int bp_len)
{
/* Give the current word an impossible length so that it fails the length
 * test at the next separator and is thrown away. */
#define SKIP_WORD() (FZ2.wlen = sizeof(DCC_FUZ2_WORD)+1)
/* Throw away the current word and go back to gathering words. */
#define JUNK() (SKIP_WORD(), FZ2.st = DCC_FUZ2_ST_WORD)
	LBUF *lbp, lbufs[FUZ2_LAN_NUM];
	FUZ2_LANG *lp;
	int i;
	char *p;
	u_char c;

	if (cks->sums[DCC_CK_FUZ2].type != DCC_CK_FUZ2)
		return;

	/* the staging buffers start empty on every call */
	for (lbp = lbufs; lbp <= LAST(lbufs); ++lbp)
		lbp->blen = 0;

	while (bp_len != 0) {
		switch (FZ2.st) {
		case DCC_FUZ2_ST_WORD:
			/* gathering a word */
			do {
				if (FZ2.cref_st == DCC_CREF_ST_IDLE) {
					--bp_len;
					c = *bp++;
					/* from here on c is the class of
					 * the byte, not the byte itself */
					c = cks->mime_cset[c];
					if (c == FC_CF) {
					    /* only HTML has character
					     * references; elsewhere treat
					     * the byte as a separator */
					    if (cks->mime_ct == DCC_CK_CT_HTML){
						FZ2.cref_st = DCC_CREF_ST_START;
						break;
					    }
					    c = FC_SP;
					}
				} else {
					/* adv_cref() only looks at *bp;
					 * its result says whether to
					 * consume it */
					i = adv_cref(cks, *bp);
					if (i == -2) {
					    JUNK(); /* bogus cref */
					    continue;
					}
					if (i < 0) {	/* get more of cref */
					    --bp_len;
					    ++bp;
					    continue;
					}
					/* i is 1 if the cref ended with ';'
					 * and 0 otherwise */
					bp += i;    /* use complete cref */
					bp_len -= i;
					/* NOTE(review): adv_cref() limits
					 * numeric references to 256, so this
					 * index can be 256; confirm that the
					 * mime_cset tables have an entry
					 * for it. */
					c = cks->mime_cset[FZ2.cref_cnt];

					if (c == FC_SK)
					    continue;   /* ignore accent mark */
				}

				/* classes from FC_A up are word characters;
				 * the class value is what is stored in
				 * the word */
				if (c >= FC_A) {
					++FZ2.btotal;
					if (FZ2.wlen < sizeof(DCC_FUZ2_WORD))
					    FZ2.w.b[FZ2.wlen++] = c;
					else
					    SKIP_WORD();
					continue;
				}

				/* a separator ends the word; words that are
				 * too short or were marked by SKIP_WORD()
				 * are dropped */
				if (c == FC_SP) {
					if (FZ2.wlen >= MIN_WLEN
					    && FZ2.wlen <=sizeof(DCC_FUZ2_WORD))
					    add_word(cks, lbufs);
					FZ2.wlen = 0;
					DCC_FUZ2_WORD_CLEAR(&FZ2.w);
					continue;
				}
				++FZ2.btotal;

				if (c == FC_LT) {
					FZ2.tag_len = 0;
					DCC_FUZ2_WORD_CLEAR(&FZ2.tag);
					FZ2.st = DCC_FUZ2_ST_START_TAG;
					break;
				}

				/* any other class spoils the word */
				JUNK();
			} while (bp_len != 0);
			break;

		case DCC_FUZ2_ST_START_TAG:
			/* collecting an HTML tag or comment
			 * We've passed the '<'
			 * c is only peeked at here; SAVE_TAG() consumes it
			 * when it belongs to the tag name. */
			c = *bp;
#define SAVE_TAG(_c) (FZ2.tag.b[FZ2.tag_len++] = _c, \
		      ++FZ2.btotal, ++bp, --bp_len)
			if (((c >= 'a' && c <= 'z')	/* tag */
			     || (c >= '0' && c <= '9'))
			    && FZ2.tag_len < sizeof(FZ2.tag)) {
				SAVE_TAG(c);
				break;
			}
			/* upper case letters are saved as lower case */
			if (c >= 'A' && c <= 'Z'
			    && FZ2.tag_len < sizeof(FZ2.tag)) {
				SAVE_TAG(c - ('A'-'a'));
				break;
			}
			if ((c == '/'	/* end-tag */
			     || c == '!')   /* start of comment */
			    && FZ2.tag_len == 0) {
				SAVE_TAG(c);
				break;
			}
			if (c == '-'    /* comment */
			    && FZ2.tag_len >= 1 && FZ2.tag_len <= 2) {
				SAVE_TAG(c);
				break;
			}
#undef SAVE_TAG

			/* c does not extend the tag name and has not been
			 * consumed, except for the simple '>' case below */

			/* notice an <html> tag while in text/plain
			 * and switch to text/html */
			if (FZ2.tag_len == 4
			    && cks->mime_ct != DCC_CK_CT_HTML
			    && !memcmp(FZ2.tag.b, "html", 4))
				cks->mime_ct = DCC_CK_CT_HTML;

			if (cks->mime_ct == DCC_CK_CT_HTML
			    && FZ2.tag_len > 0) {
				/* if we are in an HTML document and we
				 * have at least one character after '<',
				 * assume it is some kind of HTML tag */
				FZ2.xsummed += FZ2.tag_len+1;	/* count '<' */
				if (c == '>') {
					/* optimize common simple tags */
					++FZ2.xsummed;
					++FZ2.btotal;
					++bp, --bp_len;
					FZ2.st = DCC_FUZ2_ST_WORD;
					break;
				}
				if (FZ2.tag_len >= 3
				    && !memcmp(FZ2.tag.b, "!--", 3)) {
					FZ2.st = DCC_FUZ2_ST_SKIP_COMMENT;
				} else {
					FZ2.url.st = DCC_URL_ST_IDLE;
					FZ2.st = DCC_FUZ2_ST_SKIP_TAG;
				}
			} else {
				/* assume it is not an HTML tag and
				 * mark the whole word as junk */
				JUNK();
			}
			break;

		case DCC_FUZ2_ST_SKIP_TAG:
			/* Skip rest of boring HTML tag
			 * We ought to ignore '>' in quotes */
			do {
				if (FZ2.cref_st == DCC_CREF_ST_IDLE) {
					--bp_len;
					c = *bp++;
					/* NOTE(review): c is the raw byte
					 * here, while the word state compares
					 * the class cks->mime_cset[c] with
					 * FC_CF; confirm which is intended. */
					if (c == FC_CF) {
					    FZ2.cref_st = DCC_CREF_ST_START;
					    continue;
					}
				} else {
					i = adv_cref(cks, *bp);
					/* bogus cref; the byte is looked at
					 * again as an ordinary character */
					if (i == -2)
					    continue;
					if (i < 0) {	/* get more of cref */
					    --bp_len;
					    ++bp;
					    continue;
					}
					bp += i;    /* use complete cref */
					bp_len -= i;
					/* NOTE(review): cref_cnt can be 256,
					 * which does not fit in c if u_char
					 * is 8 bits wide -- confirm. */
					c = FZ2.cref_cnt;
				}

				/* capture URLs
				 * dcc_ck_url() returns an action in the bits
				 * of DCC_CK_URL_MASK and a character above
				 * DCC_CK_URL_SHIFT */
				i = dcc_ck_url(&FZ2.url, c, &FZ2.url_cp);
				c = i>>DCC_CK_URL_SHIFT;
				switch ((DCC_CK_URL)(i & DCC_CK_URL_MASK)) {
				case DCC_CK_URL_CHAR:
					break;
				case DCC_CK_URL_CK_LEN:
					/* Make room before starting a URL
					 * if we are too close to
					 * end of buffer for a maximum size URL.
					 * Discard the first URL in the buffer.
					 * This relies on dcc_ck_url() limiting
					 * the URL to DCC_URL_MAX bytes */
					while (FZ2.url_cp
					       >= &FZ2.url_buf[ISZ(FZ2.url_buf)
							- DCC_FUZ2_URL_MAX]) {
					    p = memchr(FZ2.url_buf, '\0',
						       FZ2.url_cp-FZ2.url_buf);
					    if (!p) {
						/* if this was the first URL,
						 * discard half of it */
						p = &FZ2.url_buf[DCC_URL_MAX/2];
					    } else {
						++p;
					    }
					    memmove(FZ2.url_buf, p,
						    FZ2.url_cp - p);
					    FZ2.url_cp -= p - FZ2.url_buf;
					}
					/* URLs in the buffer are separated
					 * by '\0' */
					if (FZ2.url_cp != FZ2.url_buf)
					    *FZ2.url_cp++ = '\0';
					++FZ2.urls;
					break;
				case DCC_CK_URL_HOST:
				case DCC_CK_URL_DOT:
					/* save a host name character if
					 * there is room; it is not counted
					 * as part of the tag */
					if (FZ2.url_cp
					    < &FZ2.url_buf[ISZ(FZ2.url_buf)])
					    *FZ2.url_cp++ = c;
					continue;
				case DCC_CK_URL_HOST_END:
				case DCC_CK_URL_HOST_RESET:
				case DCC_CK_URL_SKIP:
					continue;
				}

				if (c == '>') {
					++FZ2.xsummed;
					++FZ2.btotal;
					FZ2.st = DCC_FUZ2_ST_WORD;
					break;
				}
				if (cks->mime_cset[c] != FC_SP) {
					++FZ2.xsummed;
					++FZ2.btotal;
					/* don't let wild tags run forever */
					if (++FZ2.tag_len > DCC_URL_FAILSAFE) {
					    JUNK();
					    break;
					}
				}
			} while (bp_len != 0);
			break;

		case DCC_FUZ2_ST_SKIP_COMMENT:
			/* HTML comments can include HTML tags,
			 * but spammers don't understand HTML comment syntax
			 * and Netscape and IE treat (and ignore) broken
			 * comments like strange tags.
			 * So the comment is taken to end at the first '>'. */
			do {
				--bp_len;
				c = *bp++;
				if (c == '>') {
					++FZ2.xsummed;
					++FZ2.btotal;
					FZ2.st = DCC_FUZ2_ST_WORD;
					break;
				}
				if (cks->mime_cset[c] != FC_SP) {
					++FZ2.xsummed;
					++FZ2.btotal;
					/* don't let wild tags run forever */
					if (++FZ2.tag_len > DCC_URL_FAILSAFE) {
					    JUNK();
					    break;
					}
				}
			} while (bp_len != 0);
			break;
		}
	}
	/* hand the words still waiting in the staging buffers to MD5,
	 * because the buffers disappear when this function returns */
	for (lbp = lbufs, lp = FZ2.lang; lbp <= LAST(lbufs); ++lbp, ++lp) {
		if (lbp->blen != 0) {
			lp->wsummed += lbp->blen;
			FUZ2(lp, lbp->buf, lbp->blen);
		}
	}
#undef SKIP_WORD
#undef JUNK
/* BUF_LEN was defined at file scope; nothing below this line can use it */
#undef BUF_LEN
}



/* Finish the FUZ2 checksum once the whole body has been given to
 * dcc_ck_fuz2().
 *
 * The language that recognized the most words supplies the checksum.
 * On return the FUZ2 slot of cks->sums is in one of three conditions:
 *	- type set to DCC_CK_INVALID because the body is mostly binary or
 *	    has too little text,
 *	- type unchanged with an all-zero sum and rpt2srvr untouched,
 *	    for bodies with some text but too few words or URLs,
 *	- type unchanged with the MD5 result in sum and rpt2srvr set.
 * Does nothing unless the slot's type is DCC_CK_FUZ2 on entry. */
void
dcc_ck_fuz2_fin(DCC_GOT_CKS *cks)
{
	FUZ2_LANG *best, *cand;

	if (cks->sums[DCC_CK_FUZ2].type != DCC_CK_FUZ2)
		return;

	/* The language with the most words wins; the earlier language
	 * is kept on a tie. */
	best = FZ2.lang;
	for (cand = &FZ2.lang[1]; cand <= LAST(FZ2.lang); ++cand) {
		if (cand->wtotal > best->wtotal)
			best = cand;
	}

#ifdef DCC_DEBUG_CKSUM
	if (dcc_clnt_debug > 3)
		printf("\n***fuz2: wtotal[%d]=%d summed=%d+%d btotal=%d\n",
		       (int)(best-FZ2.lang),
		       best->wtotal, best->wsummed, FZ2.xsummed, FZ2.btotal);
#endif

	/* A handful of words that make up less than 10% of a big,
	 * binary file is not a valid FUZ2 checksum. */
	if (best->wtotal < 100
	    && (best->wsummed+FZ2.xsummed)*10 < FZ2.btotal) {
		cks->sums[DCC_CK_FUZ2].type = DCC_CK_INVALID;
		return;
	}

	/* A nearly empty message needs help from its URLs. */
	if (best->wtotal < 8) {
		if (best->wtotal + FZ2.urls*4 < 8) {
			/* Not enough URLs either.  A message that looks
			 * empty to FUZ2 but is not, and is not too
			 * binary, gets a zero checksum for local
			 * blacklisting.  It is not reported to the DCC
			 * server.  Anything smaller has no checksum. */
			if ((best->wsummed+FZ2.xsummed) < 120)
				cks->sums[DCC_CK_FUZ2].type = DCC_CK_INVALID;
			else
				memset(cks->sums[DCC_CK_FUZ2].sum, 0,
				       sizeof(cks->sums[DCC_CK_FUZ2].sum));
			return;
		}
		/* add the captured URLs to the few words we have */
		FUZ2(best, FZ2.url_buf, FZ2.url_cp - FZ2.url_buf);
	}

	MD5Final(cks->sums[DCC_CK_FUZ2].sum, &best->md5);
	cks->sums[DCC_CK_FUZ2].rpt2srvr = 1;
}