diff dcclib/ckbody.c @ 0:c7f6b056b673

First import of vendor version
author Peter Gervai <grin@grin.hu>
date Tue, 10 Mar 2009 13:49:58 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dcclib/ckbody.c	Tue Mar 10 13:49:58 2009 +0100
@@ -0,0 +1,435 @@
+/* Distributed Checksum Clearinghouse
+ *
+ * compute simple body checksum
+ *
+ * Copyright (c) 2008 by Rhyolite Software, LLC
+ *
+ * This agreement is not applicable to any entity which sells anti-spam
+ * solutions to others or provides an anti-spam solution as part of a
+ * security solution sold to other entities, or to a private network
+ * which employs the DCC or uses data provided by operation of the DCC
+ * but does not provide corresponding data to other users.
+ *
+ * Permission to use, copy, modify, and distribute this software without
+ * changes for any purpose with or without fee is hereby granted, provided
+ * that the above copyright notice and this permission notice appear in all
+ * copies and any distributed versions or copies are either unchanged
+ * or not called anything similar to "DCC" or "Distributed Checksum
+ * Clearinghouse".
+ *
+ * Parties not eligible to receive a license under this agreement can
+ * obtain a commercial license to use DCC by contacting Rhyolite Software
+ * at sales@rhyolite.com.
+ *
+ * A commercial license would be for Distributed Checksum and Reputation
+ * Clearinghouse software.  That software includes additional features.  This
+ * free license for Distributed ChecksumClearinghouse Software does not in any
+ * way grant permision to use Distributed Checksum and Reputation Clearinghouse
+ * software
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND RHYOLITE SOFTWARE, LLC DISCLAIMS ALL
+ * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL RHYOLITE SOFTWARE, LLC
+ * BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
+ * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Rhyolite Software DCC 1.3.103-1.56 $Revision$
+ */
+
+#include "dcc_ck.h"
+
+
+void				/* add a chunk of text to the simple body MD5 */
+dcc_ck_body0(DCC_GOT_CKS *cks, const char *bp, u_int bp_len)
+{
+#	define BUF_LEN 1024		/* buf is flushed to MD5 at this size */
+	char buf[BUF_LEN+5];		/* slack: "From" is added before the check */
+	u_char flen;			/* chars of "\n>From" matched so far */
+	int blen;			/* bytes waiting in buf */
+	char c;
+
+	if (cks->sums[DCC_CK_BODY].type != DCC_CK_BODY)
+		return;			/* slot is not marked as a body sum */
+
+	flen = cks->ctx_body.flen;	/* resume the match of the last call */
+	blen = 0;
+	for (;;) {
+		if (bp_len == 0) {
+			if (blen != 0) {
+				cks->ctx_body.total += blen;
+				MD5Update(&cks->ctx_body.md5, buf, blen);
+			}
+			cks->ctx_body.flen = flen;	/* keep for next call */
+			return;
+		}
+		--bp_len;
+		c = *bp++;
+
+		/* Ignore the '>' in the sequence "\n>From" because
+		 * it is sometimes added for old UNIX MUAs.
+		 * As a side effect, ignore '\n' */
+		if (flen != 0) {
+			if (c == "\n>From"[flen]) {
+				if (++flen >= 6) {	/* all of it matched */
+					memcpy(&buf[blen], "From", 4);
+					if ((blen += 4) >= BUF_LEN) {
+					    cks->ctx_body.total += blen;
+					    MD5Update(&cks->ctx_body.md5, buf,
+						      blen);
+					    blen = 0;
+					}
+					flen = 0;
+				}
+				continue;
+			}
+			if (--flen != 0) {	/* mismatch: sum the held ">Fro.." */
+				memcpy(&buf[blen], ">From", flen);
+				if ((blen += flen) >= BUF_LEN) {
+					cks->ctx_body.total += blen;
+					MD5Update(&cks->ctx_body.md5, buf,
+						  blen);
+					blen = 0;
+				}
+				flen = 0;
+			}
+		}
+		if (c == '\n') {
+			flen = 1;	/* '\n' starts the pattern; it is not summed */
+			continue;
+		}
+
+		/* Ignore whitespace to avoid being confused by
+		 * varying line endings added and removed by
+		 * various MUAs and MTAs.
+		 * As a side effect, ignore entirely blank messages. */
+		if (c == ' ' || c == '\t' || c == '\r')
+			continue;
+
+		/* Ignore '=' to minimize but not entirely avoid being
+		 * confused by some sequences that look like
+		 * quoted-printable triples but that are not.
+		 */
+		if (c == '=')
+			continue;
+
+		buf[blen] = c;
+		if (++blen >= BUF_LEN) {
+			cks->ctx_body.total += blen;
+			MD5Update(&cks->ctx_body.md5, buf, blen);
+			blen = 0;
+		}
+	}
+}
+
+
+
+static void
+dcc_ck_body0_fin(DCC_GOT_CKS *cks)
+{
+	DCC_GOT_SUM *body = &cks->sums[DCC_CK_BODY];
+
+	/* Finish the MD5 unconditionally because grey listing needs it. */
+	MD5Final(body->sum, &cks->ctx_body.md5);
+	/* Leave a slot that is no longer marked as a body sum alone. */
+	if (body->type != DCC_CK_BODY)
+		return;
+
+	/* Fewer than 30 summed bytes invalidate the sum; else set rpt2srvr. */
+	if (cks->ctx_body.total < 30)
+		body->type = DCC_CK_INVALID;
+	else
+		body->rpt2srvr = 1;
+}
+
+
+
+static void			/* put the MIME decoding state at its defaults */
+decoders_init(DCC_GOT_CKS *cks)
+{
+	cks->mime_bnd_matches = 0;	/* 0: dcc_ck_body() seeks a line start */
+
+	cks->flags |= DCC_CKS_MIME_BOL;	/* also set by dcc_ck_body() after '\n' */
+	cks->mime_ct = DCC_CK_CT_TEXT;
+	cks->mime_cset = dcc_cset_1;
+	cks->mime_ce = DCC_CK_CE_ASCII;	/* decode_sum() passes text unchanged */
+	cks->qp.state = DCC_CK_QP_IDLE;
+	cks->b64.quantum_cnt = 0;
+}
+
+
+
+/* put every checksum and the MIME parsing state into its initial state */
+void
+dcc_cks_init(DCC_GOT_CKS *cks)
+{
+	DCC_GOT_SUM *gp = cks->sums;
+
+	while (gp <= LAST(cks->sums)) {
+		CLR_GOT_SUM(gp);
+		++gp;
+	}
+	cks->mp_st = CK_MP_ST_TEXT;
+	cks->mhdr_st = CK_MHDR_ST_IDLE;
+	cks->mime_nest = 0;
+	cks->flags = 0;			/* before decoders_init() ORs a bit in */
+	decoders_init(cks);
+
+	/* flen 1 makes the body checksum act as if a '\n' came before */
+	MD5Init(&cks->ctx_body.md5);
+	cks->ctx_body.flen = 1;
+	cks->ctx_body.total = 0;
+	cks->sums[DCC_CK_BODY].type = DCC_CK_BODY;
+	dcc_ck_fuz1_init(cks);
+	dcc_ck_fuz2_init(cks);
+}
+
+
+
+/* decode quoted-printable and base64 and then compute the body checksums */
+static void
+decode_sum(DCC_GOT_CKS *cks, const char *bp, u_int bp_len)
+{
+	char tbuf[1024];		/* output of the qp and base64 decoders */
+	const char *tbufp;		/* text to checksum: bp or tbuf */
+	int len;			/* number of bytes at tbufp */
+
+	/* Decode quoted-printable and base64 and make fuzzy sums
+	 * only while in the body of a MIME entity.
+	 * Changing from the text, image, html, etc. requires a '\n'
+	 * to flush the URL and other decoders in the checksummers.
+	 * None of the checksums count whitespace. */
+	if (cks->mp_st != CK_MP_ST_TEXT) {
+		if (bp_len == 0)
+			return;
+#ifdef DCC_DEBUG_CKSUM
+		if (dcc_clnt_debug == 4)
+			write(1, bp, bp_len);
+#endif
+		dcc_ck_body0(cks, bp, bp_len);	/* undecoded; body MD5 only */
+		dcc_ck_fuz1(cks, "\n", 1);	/* fuzzy sums see only a '\n' */
+		dcc_ck_fuz2(cks, "\n", 1);
+		return;
+	}
+
+	while (bp_len != 0) {
+		switch (cks->mime_ce) {
+		case DCC_CK_CE_ASCII:
+		default:
+			len = bp_len;
+			tbufp = bp;	/* no decoding; use the input in place */
+			bp_len = 0;	/* the whole chunk is taken at once */
+			break;
+		case DCC_CK_CE_QP:
+			tbufp = tbuf;	/* presumably bp & bp_len are advanced */
+			len = dcc_ck_qp_decode(cks, &bp, &bp_len,
+					       tbuf, sizeof(tbuf));
+			break;
+		case DCC_CK_CE_B64:
+			tbufp = tbuf;	/* presumably bp & bp_len are advanced */
+			len = dcc_ck_b64_decode(cks, &bp, &bp_len,
+						tbuf, sizeof(tbuf));
+			break;
+		}
+
+		if (len != 0) {
+#ifdef DCC_DEBUG_CKSUM
+			if (dcc_clnt_debug == 4)
+				write(1, tbufp, len);
+#endif
+			dcc_ck_body0(cks, tbufp, len);
+			dcc_ck_fuz1(cks, tbufp, len);
+			if (cks->mime_ct != DCC_CK_CT_BINARY)	/* no fuz2 of binary */
+				dcc_ck_fuz2(cks, tbufp, len);
+		}
+	}
+}
+
+
+
+/* compute all of the body checksums on a chunk of raw text */
+void
+dcc_ck_body(DCC_GOT_CKS *cks, const void *bp, u_int bp_len)
+{
+	DCC_CK_BND *bndp;
+	const char *sum;		/* 1st input byte not swallowed */
+	const char *cmp;		/* 1st not parsed for MIME */
+	const char *cp;			/* scan position */
+	char c;
+	int len, matched_len, i, j;
+
+	sum = bp;
+	cmp = sum;
+	while (bp_len != 0) {
+		/* if we have no multipart hassles
+		 * then pass buffer to qp/base64 decoder and quit */
+		if (cks->mime_nest == 0) {
+			decode_sum(cks, sum, bp_len);
+			return;
+		}
+
+		/* look for start of next line to start matching boundaries */
+		if (cks->mime_bnd_matches == 0) {
+			cp = memchr(cmp, '\n', bp_len);
+			if (!cp) {
+				cp = cmp+bp_len;	/* no '\n': take the rest */
+			} else {
+				++cp;			/* include the '\n' */
+			}
+
+			/* look for a MIME entity header in the text before
+			 * the next line and possible start of a boundary */
+			i = cp - cmp;
+			if (cks->mp_st == CK_MP_ST_HDRS) {
+				if (parse_mime_hdr(cks, cmp, i, 0)) {
+					/* blank header line ends the headers */
+					j = cp-sum;
+					if (j) {
+					    decode_sum(cks, sum, j);
+					    sum = cp;
+					}
+					cks->mp_st = CK_MP_ST_TEXT;
+				}
+			}
+			/* We found the end of a line.  Reset positions to
+			 * start looking for a MIME boundary after it */
+			if (*(cp-1) == '\n') {
+				cks->flags |= DCC_CKS_MIME_BOL;
+				cks->mime_bnd_matches = cks->mime_nest;
+				for (bndp = cks->mime_bnd;
+				     bndp <= LAST(cks->mime_bnd);
+				     ++bndp) {
+					bndp->cmp_len = 0;
+				}
+			}
+			cmp = cp;
+			if ((bp_len -= i) == 0)
+				break;
+		}
+
+		/* look for (rest of) one of the active MIME boundaries */
+		matched_len = 0;
+		for (bndp = cks->mime_bnd;
+		     bndp < &cks->mime_bnd[cks->mime_nest];
+		     ++bndp) {
+
+			if (bndp->cmp_len == DCC_CK_BND_MISS)
+				continue;   /* already mismatched boundary */
+
+			j = bndp->bnd_len - bndp->cmp_len;	/* <0 after a '-' */
+			len = bp_len;
+			if (j > len)
+				j = len;
+			cp = cmp;
+			if (j > 0) {
+				if (memcmp(cp, &bndp->bnd[bndp->cmp_len], j)) {
+					bndp->cmp_len = DCC_CK_BND_MISS;
+					--cks->mime_bnd_matches;
+					continue;
+				}
+				/* this boundary matches so far */
+				bndp->cmp_len += j;
+				cp += j;
+				if ((len -= j) <= 0) {
+					matched_len = bp_len;	/* chunk used up */
+					continue;
+				}
+				/* since we did not exhaust len, we know
+				 * we matched the entire boundary */
+				j = 0;
+			}
+
+			/* look for 1st '-' of trailing "--" */
+			if (j == 0
+			    && *cp == '-') {
+				++bndp->cmp_len;	/* cmp_len also counts '-' */
+				if (--len <= 0) {
+					matched_len = bp_len;
+					continue;
+				}
+				++cp;
+				j = -1;
+			}
+			/* look for 2nd '-' of trailing "--" */
+			if (j == -1) {
+				if (*cp == '-') {
+					++bndp->cmp_len;
+					if (--len <= 0) {
+					    matched_len = bp_len;
+					    continue;
+					}
+					++cp;
+				} else {
+					bndp->cmp_len = DCC_CK_BND_MISS;
+					--cks->mime_bnd_matches;
+					continue;
+				}
+			}
+			/* check for trailing whitespace & '\n' */
+			if ((c = *cp) == ' ' || c == '\t' || c == '\r') {
+				do {
+					++cp;
+				} while (--len > 0
+					 && ((c = *cp) == ' ' || c == '\t'
+					     || c == '\r'));
+				if (len <= 0) {
+					matched_len = bp_len;
+					continue;
+				}
+			}
+			if (*cp != '\n') {
+				/* mismatch after the end of the boundary */
+				bndp->cmp_len = DCC_CK_BND_MISS;
+				--cks->mime_bnd_matches;
+				continue;
+			}
+
+			/* We have found a MIME boundary.
+			 * Flush b64 & qp decoders and fuzzy checksummers */
+			j = cmp-sum;
+			if (j)
+				decode_sum(cks, sum, j);
+
+			/* pass the boundary in the buffer */
+			matched_len = ++cp - cmp;
+			cmp = sum = cp;
+
+			/* Body checksum the boundary */
+			cks->mp_st = CK_MP_ST_BND;	/* decode_sum() won't decode */
+			decode_sum(cks, bndp->bnd, bndp->bnd_len);
+			if (bndp->cmp_len != bndp->bnd_len) {
+				/* checksum trailing "--" of final boundary */
+				decode_sum(cks, "--", 2);
+				/* end the current & inner entities */
+				cks->mp_st = CK_MP_ST_EPILOGUE;
+			} else {
+				/* intermediate boundaries end inner entities */
+				cks->mp_st = CK_MP_ST_HDRS;
+				++bndp;		/* keep this boundary active */
+			}
+			cks->mime_nest = bndp - cks->mime_bnd;	/* drop deeper ones */
+			decoders_init(cks);
+			break;
+		}
+		bp_len -= matched_len;
+	}
+
+	j = cmp-sum;		/* parsed text that is not yet checksummed */
+	if (j)
+		decode_sum(cks, sum, j);
+}
+
+
+
+/* finish all of the body checksums: flush the fuzzy sums, then end each one */
+void
+dcc_cks_fin(DCC_GOT_CKS *cks)
+{
+	dcc_ck_fuz1(cks, "\n", 1);	/* flush URL decoders & line buffers */
+	dcc_ck_fuz2(cks, "\n", 1);
+
+	dcc_ck_body0_fin(cks);		/* MD5Final, then check the body sum */
+	dcc_ck_fuz1_fin(cks);
+	dcc_ck_fuz2_fin(cks);
+}