Mercurial > notdcc

diff dcclib/ckmime.c @ 0:c7f6b056b673
First import of vendor version
author: Peter Gervai <grin@grin.hu>
date: Tue, 10 Mar 2009 13:49:58 +0100
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dcclib/ckmime.c	Tue Mar 10 13:49:58 2009 +0100
@@ -0,0 +1,829 @@
+/* Distributed Checksum Clearinghouse
+ *
+ * decode MIME for checksums
+ *
+ * Copyright (c) 2008 by Rhyolite Software, LLC
+ *
+ * This agreement is not applicable to any entity which sells anti-spam
+ * solutions to others or provides an anti-spam solution as part of a
+ * security solution sold to other entities, or to a private network
+ * which employs the DCC or uses data provided by operation of the DCC
+ * but does not provide corresponding data to other users.
+ *
+ * Permission to use, copy, modify, and distribute this software without
+ * changes for any purpose with or without fee is hereby granted, provided
+ * that the above copyright notice and this permission notice appear in all
+ * copies and any distributed versions or copies are either unchanged
+ * or not called anything similar to "DCC" or "Distributed Checksum
+ * Clearinghouse".
+ *
+ * Parties not eligible to receive a license under this agreement can
+ * obtain a commercial license to use DCC by contacting Rhyolite Software
+ * at sales@rhyolite.com.
+ *
+ * A commercial license would be for Distributed Checksum and Reputation
+ * Clearinghouse software.  That software includes additional features.  This
+ * free license for Distributed ChecksumClearinghouse Software does not in any
+ * way grant permision to use Distributed Checksum and Reputation Clearinghouse
+ * software
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND RHYOLITE SOFTWARE, LLC DISCLAIMS ALL
+ * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL RHYOLITE SOFTWARE, LLC
+ * BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
+ * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Rhyolite Software DCC 1.3.103-1.39 $Revision$
+ */
+
+#include "dcc_ck.h"
+
+/* Notice MIME headers
+ *	Run one header through parse_mime_hdr(): first the name (or the whole
+ *	line), then ":" and the value if the value was passed separately,
+ *	and finally a newline to end the line. */
+void
+dcc_ck_mime_hdr(DCC_GOT_CKS *cks,
+		  const char *hdr,	/* entire header line or name only */
+		  const char *str)	/* header value if not after name */
+{
+	/* restart the parser at the beginning of a header name */
+	cks->mhdr_pos = 0;
+	cks->mhdr_st = CK_MHDR_ST_CE_CT;
+	parse_mime_hdr(cks, hdr, strlen(hdr), 1);
+
+	if (cks->mhdr_st != CK_MHDR_ST_IDLE) {
+		/* the parser is still interested in this header,
+		 * so give it the separately supplied value, if any */
+		if (str != 0) {
+			parse_mime_hdr(cks, ":", 1, 1);
+			parse_mime_hdr(cks, str, strlen(str), 1);
+		}
+	}
+
+	/* end the line unless the parser has gone idle in the meantime */
+	if (cks->mhdr_st != CK_MHDR_ST_IDLE)
+		parse_mime_hdr(cks, "\n", 1, 1);
+
+	/* at least one multipart boundary is known,
+	 * so the multipart state becomes the preamble */
+	if (cks->mime_nest != 0)
+		cks->mp_st = CK_MP_ST_PREAMBLE;
+
+	/* the next bytes seen start a new line */
+	cks->flags |= DCC_CKS_MIME_BOL;
+}
+
+
+
+/* Compare the next part of the input with the unmatched tail of a target
+ *	string, ignoring case.  cks->mhdr_pos counts the target bytes that
+ *	have already been matched, so a target can span several buffers.
+ *	The matched input is consumed from *bp and *bp_len. */
+static u_char				/* 1=matched */
+match(DCC_GOT_CKS *cks,
+      enum CK_MHDR_ST ok, enum CK_MHDR_ST fail,
+      const char *tgt_str, u_int tgt_len,
+      const char **bp, u_int *bp_len)
+{
+	u_int cmp_len;
+
+	/* compare only as much as both the target and the buffer have */
+	cmp_len = min(tgt_len - cks->mhdr_pos, *bp_len);
+	if (strncasecmp(tgt_str + cks->mhdr_pos, *bp, cmp_len) != 0) {
+		/* enough has been seen to know that it does not match */
+		cks->mhdr_st = fail;
+		return 0;
+	}
+
+	/* consume what matched */
+	*bp_len -= cmp_len;
+	*bp += cmp_len;
+	cks->mhdr_pos += cmp_len;
+
+	/* part of the target remains, so wait for more input */
+	if ((u_int)cks->mhdr_pos < tgt_len)
+		return 0;
+
+	/* the entire target has been seen */
+	cks->mhdr_st = ok;
+	cks->mhdr_pos = 0;
+	return 1;
+}
+
+
+
+/* ignore white space
+ *	Advance *bp past blanks, tabs, CRs, and LFs while reducing *bp_len.
+ *	The length is checked before each byte is examined, so an empty
+ *	buffer is reported as exhausted instead of being read and having its
+ *	unsigned length wrap around. */
+static u_char				/* 0=buffer empty */
+span_ws(const char **bp, u_int *bp_len)
+{
+	char c;
+
+	while (*bp_len != 0) {
+		c = **bp;
+		if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
+			return 1;	/* stopped at something interesting */
+		++*bp;
+		--*bp_len;
+	}
+	return 0;
+}
+
+
+
+/* skip to white space or after semicolon that precedes the next parameter
+ *	Stops without consuming white space, or just after consuming a ';'.
+ *	The length is checked before each byte is examined, so an empty
+ *	buffer is reported as exhausted instead of being read. */
+static u_char				/* 0=buffer empty */
+skip_param(const char **bp, u_int *bp_len)
+{
+	char c;
+
+	while (*bp_len != 0) {
+		c = **bp;
+		if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
+			return 1;	/* leave the white space for span_ws() */
+		++*bp;
+		--*bp_len;
+		if (c == ';')
+			return 1;	/* the ';' itself has been consumed */
+	}
+	return 0;
+}
+
+
+
+/* Parse MIME headers
+ *	Look for (parts of) Content-Type and Content-Transfer-Encoding
+ *	headers in a buffer.  There can be at most one significant (not part of
+ *	folded whitespace) '\n' in the buffer and only as the last byte.
+ *
+ *	A header can arrive in pieces.  The position within the header is
+ *	kept in cks->mhdr_st and cks->mhdr_pos between calls.  What is
+ *	recognized is recorded in cks->mime_ce, cks->mime_ct, cks->mime_cset,
+ *	and in cks->mime_bnd[] with cks->mime_nest.
+ *
+ *	The result is 1 only for a blank line seen at the start of a line
+ *	outside the RFC 822 headers. */
+u_char					/* 1=blank line */
+parse_mime_hdr(DCC_GOT_CKS *cks,
+	       const char *bp, u_int bp_len,
+	       u_char in_hdrs)		/* 1=in RFC 822 headers */
+{
+#define MMATCH(str,ok,fail) match(cks,CK_MHDR_ST_##ok,CK_MHDR_ST_##fail,    \
+				  str,sizeof(str)-1, &bp, &bp_len)
+	char c;
+	DCC_CK_BND *bndp;
+
+	if ((cks->flags & DCC_CKS_MIME_BOL)
+	    && !in_hdrs) {
+		c = *bp;
+		if (c == '\r') {
+			/* ignore CR to ease detecting blank line */
+			if (--bp_len == 0)
+				return 0;
+			c = *++bp;
+		}
+		if (c == '\n')
+			return 1;	/* this line is blank */
+
+		/* reset parser line without folded whitespace */
+		if (c != ' ' && c != '\t') {
+			cks->mhdr_st = CK_MHDR_ST_CE_CT;
+			cks->mhdr_pos = 0;
+		}
+		cks->flags &= ~DCC_CKS_MIME_BOL;
+	}
+
+	do {
+		switch (cks->mhdr_st) {
+		case CK_MHDR_ST_IDLE:
+			/* nothing more of interest on this line */
+			return 0;
+
+		case CK_MHDR_ST_CE_CT:
+			/* This state always precedes the following states.
+			 * NOTE(review): the byte after "Content-T" is examined
+			 * without being consumed and without checking
+			 * bp_len, so a buffer that ends exactly after
+			 * "Content-T" relies on the next byte being
+			 * readable. */
+			if (MMATCH("Content-T", CT_WS, IDLE)) {
+				switch (*bp) {
+				case 'r':
+				case 'R':
+					cks->mhdr_st = CK_MHDR_ST_CE;
+					break;
+				case 'y':
+				case 'Y':
+					cks->mhdr_st = CK_MHDR_ST_CT;
+					break;
+				default:
+					cks->mhdr_st = CK_MHDR_ST_IDLE;
+					return 0;
+				}
+			}
+			break;
+
+		case CK_MHDR_ST_CE:
+			MMATCH("ransfer-Encoding:", CE_WS, IDLE);
+			break;
+		case CK_MHDR_ST_CE_WS:
+			/* We have matched "Content-Transfer-Encoding:" */
+			if (!span_ws(&bp, &bp_len))
+				return 0;
+			switch (*bp) {
+			case 'b':
+			case 'B':
+				cks->mhdr_st = CK_MHDR_ST_B64;
+				break;
+			case 'q':
+			case 'Q':
+				cks->mhdr_st = CK_MHDR_ST_QP;
+				break;
+			default:
+				cks->mhdr_st = CK_MHDR_ST_IDLE;
+				return 0;
+			}
+			break;
+		case CK_MHDR_ST_QP:
+			if (MMATCH("quoted-printable", IDLE, IDLE))
+				cks->mime_ce = DCC_CK_CE_QP;
+			break;
+		case CK_MHDR_ST_B64:
+			if (MMATCH("base64", IDLE, IDLE))
+				cks->mime_ce = DCC_CK_CE_B64;
+			break;
+
+		case CK_MHDR_ST_CT:
+			MMATCH("ype:", CT_WS, IDLE);
+			break;
+		case CK_MHDR_ST_CT_WS:
+			/* We have matched "Content-type:" */
+			if (!span_ws(&bp, &bp_len))
+				return 0;
+			switch (*bp) {
+			case 't':
+			case 'T':
+				cks->mhdr_st = CK_MHDR_ST_TEXT;
+				break;
+			case 'm':
+			case 'M':
+				/* do not nest too deeply */
+				if (in_hdrs
+				    || cks->mime_nest < DIM(cks->mime_bnd)) {
+					cks->mhdr_st = CK_MHDR_ST_MULTIPART;
+				} else {
+					/* There is no room to remember another
+					 * boundary.  Treat the part as text
+					 * as is done for other "Content-type: M"
+					 * values, instead of storing a parser
+					 * state that is at once overwritten. */
+					cks->mime_ct = DCC_CK_CT_TEXT;
+					cks->mhdr_st = CK_MHDR_ST_IDLE;
+				}
+				break;
+			default:
+				/* assume it is binary noise if it does
+				 * not match "Content-type: [tTmM]" */
+				cks->mime_ct = DCC_CK_CT_BINARY;
+				cks->mhdr_st = CK_MHDR_ST_IDLE;
+				return 0;
+			}
+			break;
+		case CK_MHDR_ST_TEXT:
+			/* we are looking for "Text" in "Content-type: Text" */
+			if (MMATCH("text", HTML, IDLE))
+				cks->mime_ct = DCC_CK_CT_TEXT;
+			break;
+		case CK_MHDR_ST_HTML:
+			/* look for "Content-type: Text/html" */
+			if (MMATCH("/html", CSET_SKIP_PARAM, CSET_SKIP_PARAM))
+				cks->mime_ct = DCC_CK_CT_HTML;
+			break;
+		case CK_MHDR_ST_CSET_SKIP_PARAM:
+			/* Look for semicolon or whitespace preceding next
+			 * parameter after "Content-type: Text/html" */
+			if (skip_param(&bp, &bp_len))
+				cks->mhdr_st = CK_MHDR_ST_CSET_SPAN_WS;
+			break;
+		case CK_MHDR_ST_CSET_SPAN_WS:
+			/* skip optional whitespace before next parameter */
+			if (span_ws(&bp, &bp_len))
+				cks->mhdr_st = CK_MHDR_ST_CSET;
+			break;
+		case CK_MHDR_ST_CSET:
+			/* have matched "Content-Type: text...;"
+			 * and are looking for a "charset=" parameter */
+			MMATCH("charset=", CSET_ISO_8859, CSET_SKIP_PARAM);
+			break;
+		case CK_MHDR_ST_CSET_ISO_8859:
+			/* We have matched "Content-Type: text...charset="
+			 * and are looking for "ISO-8859-*".
+			 * Ignore leading '"' */
+			if (cks->mhdr_pos == 0
+			    && bp_len > 0 && *bp == '"') {
+				++bp;
+				--bp_len;
+			}
+			MMATCH("iso-8859-", CSET_ISO_X, IDLE);
+			break;
+		case CK_MHDR_ST_CSET_ISO_X:
+			/* Accumulate the decimal number after "iso-8859-"
+			 * in mhdr_pos.  Only a properly ended "2" selects
+			 * dcc_cset_2; anything else gets dcc_cset_1. */
+			for (;;) {
+				if (bp_len == 0)
+					return 0;
+				--bp_len;
+				c = *bp++;
+				if (c < '0' || c > '9') {
+					if ((c == '"' || c == ' ' || c == '\t'
+					     || c == ';'
+					     || c == '\r' || c == '\n')
+					    && cks->mhdr_pos == 2)
+					    cks->mime_cset = dcc_cset_2;
+					else
+					    cks->mime_cset = dcc_cset_1;
+					cks->mhdr_st = CK_MHDR_ST_IDLE;
+					return 0;
+				}
+				cks->mhdr_pos = cks->mhdr_pos*10 + c - '0';
+				if (cks->mhdr_pos > 99) {
+					/* give up on a silly number */
+					cks->mhdr_st = CK_MHDR_ST_IDLE;
+					return 0;
+				}
+			}
+		case CK_MHDR_ST_MULTIPART:
+			/* We are looking for "Content-type: Multipart"
+			 * after having seen "Content-type: M".
+			 * If it is not "ultipart", assume "essage" and that
+			 * it is text.
+			 * Record that in the content type and not in the
+			 * parser state, so that a "multipart" split between
+			 * buffers resumes in this state with mhdr_pos
+			 * still counting into "multipart". */
+			cks->mime_ct = DCC_CK_CT_TEXT;
+			MMATCH("multipart", BND_SKIP_PARAM, IDLE);
+			break;
+		case CK_MHDR_ST_BND_SKIP_PARAM:
+			/* Look for semicolon or whitespace preceding next
+			 * parameter after "Content-type: M" */
+			if (skip_param(&bp, &bp_len))
+				cks->mhdr_st = CK_MHDR_ST_BND_SPAN_WS;
+			break;
+		case CK_MHDR_ST_BND_SPAN_WS:
+			/* skip optional whitespace before next parameter */
+			if (span_ws(&bp, &bp_len))
+				cks->mhdr_st = CK_MHDR_ST_BND;
+			break;
+		case CK_MHDR_ST_BND:
+			/* we have matched "Content-type: multipart"
+			 * and are looking for the "boundary" parameter */
+			if (MMATCH("boundary=", BND_VALUE, BND_SKIP_PARAM)) {
+				if (in_hdrs) {
+					/* a boundary in the message headers
+					 * is the outermost boundary */
+					cks->mime_nest = 0;
+					/* allow missing initial blank line */
+					cks->mime_bnd_matches = 1;
+				}
+				bndp = &cks->mime_bnd[cks->mime_nest];
+				cks->flags &= ~DCC_CKS_MIME_QUOTED;
+				/* the saved boundary starts with "--",
+				 * so the value is collected at offset 2 */
+				bndp->bnd[0] = '-';
+				bndp->bnd[1] = '-';
+				cks->mhdr_pos = 2;
+			}
+			break;
+		case CK_MHDR_ST_BND_VALUE:
+			/* collect the boundary string */
+			bndp = &cks->mime_bnd[cks->mime_nest];
+			/* this accepts a lot more than RFC 2046 allows,
+			 * but spamware written by idiots doesn't comply */
+			for (;;) {
+				if (bp_len == 0)
+					return 0;
+				--bp_len;
+				c = *bp++;
+				if (c == '\n')
+					break;
+				if (c == '\r')
+					continue;
+				if ((c == ' ' || c == '\t' || c == ';')
+				    && !(cks->flags & DCC_CKS_MIME_QUOTED))
+					break;
+				if (c == '"') {
+					cks->flags ^= DCC_CKS_MIME_QUOTED;
+					continue;
+				}
+				bndp->bnd[cks->mhdr_pos] = c;
+				if (++cks->mhdr_pos >= DCC_CK_BND_MAX) {
+					/* forget a boundary that is too long */
+					cks->mhdr_st = CK_MHDR_ST_IDLE;
+					return 0;
+				}
+			}
+			bndp->bnd_len = cks->mhdr_pos;
+			bndp->cmp_len = 0;
+			++cks->mime_nest;
+			cks->mhdr_st = CK_MHDR_ST_IDLE;
+			break;
+		}
+	} while (bp_len != 0);
+	return 0;
+
+#undef MMATCH
+}
+
+
+
+/* value of one hexadecimal digit or -1 if the byte is not one */
+static int
+qp_hex_digit(u_char c)
+{
+	if (c >= '0' && c <= '9')
+		return c-'0';
+	if (c >= 'a' && c <= 'f')
+		return c-('a'-10);
+	if (c >= 'A' && c <= 'F')
+		return c-('A'-10);
+	return -1;
+}
+
+/* fetch bytes and convert from quoted-printable
+ *	Input is taken from *ibufp and *ibuf_lenp, which are advanced past
+ *	what was consumed.  Decoding stops when the input is exhausted or
+ *	when obuf_len bytes have been produced.  The decoder state is kept
+ *	in cks->qp so that an escape can be split between calls.
+ *	"=XX" becomes one byte, "=\n" and "=\r\n" are deleted, and
+ *	an '=' that starts none of those is copied with what follows it. */
+u_int					/* output length */
+dcc_ck_qp_decode(DCC_GOT_CKS *cks, const char **ibufp, u_int *ibuf_lenp,
+		 char *obuf, u_int obuf_len)
+{
+	const char *src;
+	u_int src_len, out_len;
+	u_char c = 0;
+	int digit;
+
+	if (obuf_len == 0)
+		return 0;
+	src = *ibufp;
+	src_len = *ibuf_lenp;
+	out_len = 0;
+
+	/* Each trip either consumes input without output and continues,
+	 * or leaves the switch with a byte in c to be stored. */
+	while (src_len != 0) {
+		switch (cks->qp.state) {
+		case DCC_CK_QP_IDLE:
+			c = *src++;
+			--src_len;
+			if (c == '=') {
+				cks->qp.state = DCC_CK_QP_EQ;
+				continue;
+			}
+			break;
+
+		case DCC_CK_QP_EQ:
+			/* Consider first character after '=' */
+			c = *src++;
+			--src_len;
+			cks->qp.x = c;
+			if (c == '\n') {
+				/* delete "=\n" like "=\r\n"
+				 * so that dccproc and dccm agree */
+				cks->qp.state = DCC_CK_QP_IDLE;
+				continue;
+			}
+			if (c != '\r') {
+				digit = qp_hex_digit(c);
+				if (digit < 0) {
+					/* emit the '=' now
+					 * and this character next */
+					cks->qp.state = DCC_CK_QP_FAIL1;
+					c = '=';
+					break;
+				}
+				cks->qp.n = digit;
+			}
+			cks->qp.state = DCC_CK_QP_1;
+			continue;
+
+		case DCC_CK_QP_1:
+			/* consider second character after '=' */
+			c = *src++;
+			--src_len;
+			cks->qp.y = c;
+			if (cks->qp.x == '\r') {
+				if (c == '\n') {
+					/* delete soft line-break */
+					cks->qp.state = DCC_CK_QP_IDLE;
+					continue;
+				}
+				cks->qp.state = DCC_CK_QP_FAIL2;
+				c = '=';
+				break;
+			}
+			digit = qp_hex_digit(c);
+			if (digit < 0) {
+				cks->qp.state = DCC_CK_QP_FAIL2;
+				c = '=';
+				break;
+			}
+			cks->qp.state = DCC_CK_QP_IDLE;
+			c = (cks->qp.n << 4) | digit;
+			break;
+
+		case DCC_CK_QP_FAIL1:
+			/* output character after '=' of a 2-character
+			 * sequence that was not quoted-printable after all */
+			c = cks->qp.x;
+			cks->qp.state = DCC_CK_QP_IDLE;
+			break;
+
+		case DCC_CK_QP_FAIL2:
+			/* output character after '=' of a 3-character
+			 * sequence that was not quoted-printable after all */
+			c = cks->qp.x;
+			cks->qp.state = DCC_CK_QP_FAIL3;
+			break;
+
+		case DCC_CK_QP_FAIL3:
+			/* output third character of a 3-character
+			 * sequence that was not quoted-printable after all */
+			c = cks->qp.y;
+			cks->qp.state = DCC_CK_QP_IDLE;
+			break;
+		}
+
+		obuf[out_len++] = c;
+		if (out_len >= obuf_len)
+			break;
+	}
+
+	/* tell the caller how much input remains */
+	*ibufp = src;
+	*ibuf_lenp = src_len;
+	return out_len;
+}
+
+
+
+
+#define B64B	0100			/* bad */
+#define B64EQ	0101			/* '=' */
+/* Values of the base64 alphabet indexed by 7-bit ASCII code.
+ *	There are no entries for bytes with the high bit set. */
+static const u_char base64_decode[128] = {
+    B64B, B64B, B64B, B64B, B64B, B64B, B64B, B64B, /* 0x00 */
+    B64B, B64B, B64B, B64B, B64B, B64B, B64B, B64B, /* 0x08 */
+    B64B, B64B, B64B, B64B, B64B, B64B, B64B, B64B, /* 0x10 */
+    B64B, B64B, B64B, B64B, B64B, B64B, B64B, B64B, /* 0x18 */
+
+    B64B, B64B, B64B, B64B, B64B, B64B, B64B, B64B, /* 0x20   ! " # $ % & ' */
+    B64B, B64B, B64B, 62,   B64B, B64B, B64B, 63,   /* 0x28 ( ) * + , - . / */
+
+    52,   53,   54,   55,   56,   57,   58,   59,   /* 0x30 0 1 2 3 4 5 6 7 */
+    60,   61,   B64B, B64B, B64B, B64EQ,B64B, B64B, /* 0x38 8 9 : ; < = > ? */
+
+    B64B, 0,    1,    2,    3,    4,    5,    6,    /* 0x40 @ A B C D E F G */
+    7,    8,    9,    10,   11,   12,   13,   14,   /* 0x48 H I J K L M N O */
+
+    15,   16,   17,   18,   19,   20,   21,   22,   /* 0x50 P Q R S T U V W */
+    23,   24,   25,   B64B, B64B, B64B, B64B, B64B, /* 0x58 X Y Z [ \ ] ^ _ */
+
+    B64B, 26,   27,   28,   29,   30,   31,   32,   /* 0x60 ` a b c d e f g */
+    33,   34,   35,   36,   37,   38,   39,   40,   /* 0x68 h i j k l m n o */
+
+    41,   42,   43,   44,   45,   46,   47,   48,   /* 0x70 p q r s t u v w */
+    49,   50,   51,   B64B, B64B, B64B, B64B, B64B, /* 0x78 x y z { | } ~ del */
+};
+
+/* fetch bytes and convert from base64
+ *	Input is taken from *ibufp and *ibuf_lenp, which are advanced past
+ *	what was consumed.  Bytes outside the base64 alphabet are skipped.
+ *	The partial quantum is kept in cks->b64 so that a group of four
+ *	characters can be split between calls.
+ *	Nothing is decoded unless there is room for at least 3 bytes,
+ *	because a complete quantum is stored without checking for room. */
+u_int					/* output length */
+dcc_ck_b64_decode(DCC_GOT_CKS *cks, const char **ibufp, u_int *ibuf_lenp,
+	      char *obuf, u_int obuf_len)
+{
+	u_char c;
+	const char *ibuf;
+	u_int ibuf_len, result;
+
+	if (obuf_len < 3)
+		return 0;
+	/* stop while there is still room for one more complete quantum */
+	obuf_len -= 3;
+	ibuf_len = *ibuf_lenp;
+	ibuf = *ibufp;
+	result = 0;
+	while (ibuf_len != 0) {
+		--ibuf_len;
+		c = *ibuf++;
+		/* bytes with the high bit set would index beyond the table
+		 * and cannot be base64 */
+		if (c >= sizeof(base64_decode)/sizeof(base64_decode[0]))
+			continue;
+		c = base64_decode[c];
+		if (c == B64B)
+			continue;
+
+		if (c == B64EQ) {
+			/* padding ends the quantum early */
+			switch (cks->b64.quantum_cnt) {
+			case 2:
+				/* 12 bits collected yield 1 byte */
+				*obuf++ = cks->b64.quantum>>4;
+				++result;
+				break;
+			case 3:
+				/* 18 bits collected yield 2 bytes */
+				*obuf++ = cks->b64.quantum>>10;
+				*obuf++ = cks->b64.quantum>>2;
+				result += 2;
+				break;
+			default:
+				break;
+			}
+			cks->b64.quantum_cnt = 0;
+			if (result >= obuf_len)
+				break;
+			/* the '=' is not data and so must not be shifted
+			 * into the next quantum or counted as part of it */
+			continue;
+		}
+
+		cks->b64.quantum = (cks->b64.quantum << 6) | c;
+		if (++cks->b64.quantum_cnt >= 4) {
+			/* 24 bits collected yield 3 bytes */
+			cks->b64.quantum_cnt = 0;
+			*obuf++ = cks->b64.quantum>>16;
+			*obuf++ = cks->b64.quantum>>8;
+			*obuf++ = cks->b64.quantum;
+			result += 3;
+			if (result >= obuf_len)
+				break;
+		}
+	}
+	*ibuf_lenp = ibuf_len;
+	*ibufp = ibuf;
+	return result;
+}
+
+
+
+/* skip parts of URLs
+ *	Feed one character at a time through the state machine in *url.
+ *	The result is the character, folded to lower case and possibly
+ *	replaced by a decoded %XX value, shifted left by DCC_CK_URL_SHIFT
+ *	and combined with one of the DCC_CK_URL_* codes.
+ *	*pbufp is remembered, reset, and moved back as the host name is
+ *	trimmed.  Presumably it is the caller's position in a buffer that
+ *	collects the host name -- TODO confirm against the caller. */
+int
+dcc_ck_url(DCC_URL_SKIP *url, char c, char **pbufp)
+{
+#define RET_C(s) return ((c<<DCC_CK_URL_SHIFT) | s)
+
+	/* Continue skipping a URL to its end.
+	 * Assume the end is the next blank, comma, '>', or '\n'
+	 * unless the URL is quoted.  Then continue to the quote
+	 * or until the length has become silly. */
+
+	/* convert ASCII upper to lower case */
+	if (c >= 'A' && c <= 'Z')
+		c -= 'A' - 'a';		/* 'A'-'a' is negative, so this adds */
+
+	switch (url->st) {
+	case DCC_URL_ST_IDLE:
+		if (c == 'h') {
+			/* start looking for 't' after 'h' in "http" */
+			url->flags = 0;
+			url->st = DCC_URL_ST_T1;
+		} else if (c == '=') {
+			/* look for the '=' in "href=" or "img src=" */
+			url->st = DCC_URL_ST_QUOTE;
+		}
+		RET_C(DCC_CK_URL_CHAR);
+
+	case DCC_URL_ST_QUOTE:
+		/* look for '"' or 'H' after "href=" or "img src=" */
+		if (c == 'h') {
+			url->flags &= ~DCC_URL_QUOTES;
+			url->st = DCC_URL_ST_T1;
+		} else if (c == '"') {
+			url->flags |= DCC_URL_DQUOTED;
+			url->st = DCC_URL_ST_QH;
+		} else if (c == '\'') {
+			url->flags |= DCC_URL_SQUOTED;
+			url->st = DCC_URL_ST_QH;
+		} else {
+			url->st = DCC_URL_ST_IDLE;
+		}
+		RET_C(DCC_CK_URL_CHAR);
+
+	case DCC_URL_ST_QH:
+		/* seen quote; looking for start of URL */
+		if (c == 'h') {
+			url->st = DCC_URL_ST_T1;
+		} else {
+			url->st = DCC_URL_ST_IDLE;
+		}
+		RET_C(DCC_CK_URL_CHAR);
+
+	case DCC_URL_ST_T1:		/* expecting the first 't' of "http" */
+		if (c == 't')
+			url->st = DCC_URL_ST_T2;
+		else
+			url->st = DCC_URL_ST_IDLE;
+		RET_C(DCC_CK_URL_CHAR);
+
+	case DCC_URL_ST_T2:		/* expecting the second 't' of "http" */
+		if (c == 't')
+			url->st = DCC_URL_ST_P;
+		else
+			url->st = DCC_URL_ST_IDLE;
+		RET_C(DCC_CK_URL_CHAR);
+
+	case DCC_URL_ST_P:		/* expecting the 'p' of "http" */
+		if (c == 'p')
+			url->st = DCC_URL_ST_S;
+		else
+			url->st = DCC_URL_ST_IDLE;
+		RET_C(DCC_CK_URL_CHAR);
+
+	case DCC_URL_ST_S:
+		/* we are expecting the ':' or 's' after http */
+		if (c == 's')
+			url->st = DCC_URL_ST_COLON;
+		else if (c == ':')
+			url->st = DCC_URL_ST_SLASH1;
+		else
+			url->st = DCC_URL_ST_IDLE;
+		RET_C(DCC_CK_URL_CHAR);
+
+	case DCC_URL_ST_COLON:
+		/* we are expecting the ':' after http or https */
+		if (c == ':')
+			url->st = DCC_URL_ST_SLASH1;
+		else
+			url->st = DCC_URL_ST_IDLE;
+		RET_C(DCC_CK_URL_CHAR);
+
+	case DCC_URL_ST_SLASH1:
+		/* we are expecting the first '/' after http: */
+		if (c == '/')
+			url->st = DCC_URL_ST_SLASH2;
+		else
+			url->st = DCC_URL_ST_IDLE;
+		RET_C(DCC_CK_URL_CHAR);
+
+	case DCC_URL_ST_SLASH2:
+		/* we are expecting the second '/' after "http:/" */
+		if (c != '/') {
+			url->st = DCC_URL_ST_IDLE;
+			RET_C(DCC_CK_URL_CHAR);
+		}
+		url->st = DCC_URL_ST_SLASH3_START;
+		RET_C(DCC_CK_URL_CK_LEN);
+
+	case DCC_URL_ST_SLASH3_START:	/* first character of a host name */
+		url->dot = 0;
+		url->start = *pbufp;	/* where this host name begins */
+		url->total = 0;
+		url->flags &= ~(DCC_URL_DEL_DOMAIN
+				| DCC_URL_PERCENT1 | DCC_URL_PERCENT2);
+		url->st = DCC_URL_ST_SLASH3;
+		/* fall into DCC_URL_ST_SLASH3 */
+	case DCC_URL_ST_SLASH3:
+		/* look for the end of the host name */
+		++url->total;
+again:					/* re-examine a decoded %XX character */
+		if (c == '.') {
+			/* keep only 1st and 2nd level domain names */
+			url->flags &= ~DCC_URL_DEL_DOMAIN;
+			if (!url->dot) {
+				/* do nothing at first '.' unless the name
+				 * was too long */
+				if (*pbufp >= url->start+DCC_URL_MAX) {
+					*pbufp = url->start;
+				} else {
+					url->dot = *pbufp;
+				}
+			} else {
+				url->flags |= DCC_URL_DEL_DOMAIN;
+			}
+			RET_C(DCC_CK_URL_DOT);
+		}
+		/* delay deleting third level domains to not be
+		 * fooled by a trailing dot.
+		 * What has been collected since the previous dot is moved
+		 * down to the start and *pbufp is moved back to match. */
+		if (url->flags & DCC_URL_DEL_DOMAIN) {
+			url->flags &= ~DCC_URL_DEL_DOMAIN;
+			memmove(url->start, url->dot,
+				*pbufp - url->dot);
+			*pbufp -= (url->dot - url->start);
+			url->dot = *pbufp;
+		}
+
+		if (c == '/') {
+			/* the path follows and is skipped */
+			url->st = DCC_URL_ST_SKIP;
+			RET_C(DCC_CK_URL_HOST_END);
+		}
+		if (c == '"' && (url->flags & DCC_URL_DQUOTED)) {
+			url->st = DCC_URL_ST_IDLE;
+			RET_C(DCC_CK_URL_HOST_END);
+		}
+		if (c == '\'' && (url->flags & DCC_URL_SQUOTED)) {
+			url->st = DCC_URL_ST_IDLE;
+			RET_C(DCC_CK_URL_HOST_END);
+		}
+		if ((c == '<' || c == '>')
+		    && (url->flags & DCC_URL_QUOTES) == 0) {
+			url->st = DCC_URL_ST_IDLE;
+			RET_C(DCC_CK_URL_HOST_END);
+		}
+		if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
+			/* NOTE(review): this tests DCC_URL_QUOTED while the
+			 * other quote tests use DCC_URL_QUOTES; confirm
+			 * that the two are equivalent */
+			if (!(url->flags & DCC_URL_QUOTED)
+			    || url->total > DCC_URL_FAILSAFE) {
+				url->st = DCC_URL_ST_IDLE;
+				RET_C(DCC_CK_URL_HOST_END);
+			}
+			/* whitespace in a URL hostname is at best username */
+			*pbufp = url->start;
+			url->st = DCC_URL_ST_SLASH3_START;
+			RET_C(DCC_CK_URL_HOST_RESET);
+		}
+		if (c == '@') {
+			/* ignore username and password */
+			*pbufp = url->start;
+			url->st = DCC_URL_ST_SLASH3_START;
+			RET_C(DCC_CK_URL_HOST_RESET);
+		}
+
+		if (c == '%') {
+			/* start of a %XX escape */
+			url->flags &= ~DCC_URL_PERCENT2;
+			url->flags |= DCC_URL_PERCENT1;
+			RET_C(DCC_CK_URL_SKIP);
+		}
+		if (url->flags & DCC_URL_PERCENT1) {
+			/* a hex digit of a %XX escape; upper case digits
+			 * were folded to lower case above */
+			if (c >= '0' && c <= '9') {
+				c -= '0';
+			} else if (c >= 'a' && c <= 'f') {
+				c -= 'a'-10;
+			} else {
+				*pbufp = url->start;
+				url->st = DCC_URL_ST_SLASH3_START;
+				RET_C(DCC_CK_URL_HOST_RESET);
+			}
+			if (url->flags & DCC_URL_PERCENT2) {
+				/* second digit; treat the decoded character
+				 * as if it had appeared in the URL */
+				url->flags &= ~(DCC_URL_PERCENT1
+						| DCC_URL_PERCENT2);
+				c |= url->percent;
+				if (c >= 'A' && c <= 'Z')
+					c -= 'A' - 'a';
+				goto again;
+			}
+			/* first digit; save the high 4 bits */
+			url->percent = c << 4;
+			url->flags |= DCC_URL_PERCENT2;
+			RET_C(DCC_CK_URL_SKIP);
+		}
+
+		if (*pbufp >= url->start+DCC_URL_MAX) {
+			/* long garbage is probably a username */
+			if (url->total > DCC_URL_FAILSAFE) {
+				url->st = DCC_URL_ST_IDLE;
+				RET_C(DCC_CK_URL_CHAR);
+			}
+			RET_C(DCC_CK_URL_SKIP);
+		}
+		RET_C(DCC_CK_URL_HOST);
+
+	case DCC_URL_ST_SKIP:
+		/* skip the rest of the URL */
+		++url->total;
+		if (c == '"' || c == '\'') {
+			url->st = DCC_URL_ST_IDLE;
+			RET_C(DCC_CK_URL_SKIP);
+		}
+		if ((c == '>' || c == ' ' || c == '\t'
+		    || c == '\n' || c == '\r')
+		    && (!(url->flags & DCC_URL_QUOTES)
+			|| url->total > DCC_URL_FAILSAFE)) {
+			url->total = 0;
+			url->st = DCC_URL_ST_IDLE;
+			RET_C(DCC_CK_URL_CHAR);
+		}
+		RET_C(DCC_CK_URL_SKIP);
+	}
+	RET_C(DCC_CK_URL_CHAR);		/* unrecognized state */
+
+#undef RET_C
+}
author	Peter Gervai <grin@grin.hu>
date	Tue, 10 Mar 2009 13:49:58 +0100
parents
children