comparison dcclib/ckmime.c @ 0:c7f6b056b673

First import of vendor version
author Peter Gervai <grin@grin.hu>
date Tue, 10 Mar 2009 13:49:58 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c7f6b056b673
1 /* Distributed Checksum Clearinghouse
2 *
3 * decode MIME for checksums
4 *
5 * Copyright (c) 2008 by Rhyolite Software, LLC
6 *
7 * This agreement is not applicable to any entity which sells anti-spam
8 * solutions to others or provides an anti-spam solution as part of a
9 * security solution sold to other entities, or to a private network
10 * which employs the DCC or uses data provided by operation of the DCC
11 * but does not provide corresponding data to other users.
12 *
13 * Permission to use, copy, modify, and distribute this software without
14 * changes for any purpose with or without fee is hereby granted, provided
15 * that the above copyright notice and this permission notice appear in all
16 * copies and any distributed versions or copies are either unchanged
17 * or not called anything similar to "DCC" or "Distributed Checksum
18 * Clearinghouse".
19 *
20 * Parties not eligible to receive a license under this agreement can
21 * obtain a commercial license to use DCC by contacting Rhyolite Software
22 * at sales@rhyolite.com.
23 *
24 * A commercial license would be for Distributed Checksum and Reputation
25 * Clearinghouse software. That software includes additional features. This
26 * free license for Distributed ChecksumClearinghouse Software does not in any
27 * way grant permision to use Distributed Checksum and Reputation Clearinghouse
28 * software
29 *
30 * THE SOFTWARE IS PROVIDED "AS IS" AND RHYOLITE SOFTWARE, LLC DISCLAIMS ALL
31 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
32 * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL RHYOLITE SOFTWARE, LLC
33 * BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
34 * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
35 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
36 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
37 * SOFTWARE.
38 *
39 * Rhyolite Software DCC 1.3.103-1.39 $Revision$
40 */
41
42 #include "dcc_ck.h"
43
44 /* Notice MIME headers */
45 void
46 dcc_ck_mime_hdr(DCC_GOT_CKS *cks,
47 const char *hdr, /* entire header line or name only */
48 const char *str) /* header value if not after name */
49 {
50 /* parse at least the header name */
51 cks->mhdr_st = CK_MHDR_ST_CE_CT;
52 cks->mhdr_pos = 0;
53 parse_mime_hdr(cks, hdr, strlen(hdr), 1);
54
55 /* parse the header value if present and we care about the header */
56 if (str
57 && cks->mhdr_st != CK_MHDR_ST_IDLE) {
58 parse_mime_hdr(cks, ":", 1, 1);
59 parse_mime_hdr(cks, str, strlen(str), 1);
60 }
61
62 /* force the end of the line */
63 if (cks->mhdr_st != CK_MHDR_ST_IDLE)
64 parse_mime_hdr(cks, "\n", 1, 1);
65
66 if (cks->mime_nest != 0)
67 cks->mp_st = CK_MP_ST_PREAMBLE;
68
69 cks->flags |= DCC_CKS_MIME_BOL;
70 }
71
72
73
74 static u_char /* 1=matched */
75 match(DCC_GOT_CKS *cks,
76 enum CK_MHDR_ST ok, enum CK_MHDR_ST fail,
77 const char *tgt_str, u_int tgt_len,
78 const char **bp, u_int *bp_len)
79 {
80 u_int len;
81
82 len = min(tgt_len - cks->mhdr_pos, *bp_len);
83 if (strncasecmp(tgt_str + cks->mhdr_pos, *bp, len)) {
84 /* switch to failure state if there is enough of the
85 * string to know it does not match */
86 cks->mhdr_st = fail;
87 return 0;
88 }
89
90 *bp += len;
91 *bp_len -= len;
92 if ((u_int)(cks->mhdr_pos += len) >= tgt_len) {
93 /* switch to the success state on a match */
94 cks->mhdr_st = ok;
95 cks->mhdr_pos = 0;
96 return 1;
97 }
98
99 /* wait for more input */
100 return 0;
101 }
102
103
104
/* ignore white space
 *	Advance *bp past blanks, tabs, carriage returns and line feeds while
 *	reducing *bp_len by the number of bytes skipped.
 *	Returns 1 with *bp at the first byte that is not white space, or
 *	0 when the buffer ran out (including when it was empty on entry).
 *	A byte is examined only while *bp_len says it exists, so an empty
 *	buffer is neither read nor allowed to wrap the unsigned length. */
static u_char				/* 0=buffer empty */
span_ws(const char **bp, u_int *bp_len)
{
	char c;

	while (*bp_len != 0) {
		c = **bp;
		if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
			return 1;
		++*bp;
		--*bp_len;
	}
	return 0;
}
117
118
119
/* skip to white space or after semicolon that precedes the next parameter
 *	Consume bytes from *bp, reducing *bp_len, until either
 *	    - a blank, tab, CR or LF is found, which is left unconsumed, or
 *	    - a ';' has been consumed.
 *	Both cases return 1.  0 is returned when the buffer ran out first,
 *	including when it was empty on entry.  A byte is examined only while
 *	*bp_len says it exists, so an empty buffer is neither read nor
 *	allowed to wrap the unsigned length. */
static u_char				/* 0=buffer empty */
skip_param(const char **bp, u_int *bp_len)
{
	char c;

	while (*bp_len != 0) {
		c = **bp;
		if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
			return 1;
		++*bp;
		--*bp_len;
		if (c == ';')
			return 1;
	}
	return 0;
}
136
137
138
139 /* Parse MIME headers
140 * Look for (parts of) Content-Type and Content-Transfer-Encoding
141 * headers in a buffer. There can be at most one significant (not part of
142 * folded whitespace) '\n' in the buffer and only as the last byte */
143 u_char /* 1=blank line */
144 parse_mime_hdr(DCC_GOT_CKS *cks,
145 const char *bp, u_int bp_len,
146 u_char in_hdrs) /* 1=in RFC 822 headers */
147 {
148 #define MMATCH(str,ok,fail) match(cks,CK_MHDR_ST_##ok,CK_MHDR_ST_##fail, \
149 str,sizeof(str)-1, &bp, &bp_len)
150 char c;
151 DCC_CK_BND *bndp;
152
153 if ((cks->flags & DCC_CKS_MIME_BOL)
154 && !in_hdrs) {
155 c = *bp;
156 if (c == '\r') {
157 /* ignore CR to ease detecting blank line */
158 if (--bp_len == 0)
159 return 0;
160 c = *++bp;
161 }
162 if (c == '\n')
163 return 1; /* this line is blank */
164
165 /* reset parser line without folded whitespace */
166 if (c != ' ' && c != '\t') {
167 cks->mhdr_st = CK_MHDR_ST_CE_CT;
168 cks->mhdr_pos = 0;
169 }
170 cks->flags &= ~DCC_CKS_MIME_BOL;
171 }
172
173 do {
174 switch (cks->mhdr_st) {
175 case CK_MHDR_ST_IDLE:
176 return 0;
177
178 case CK_MHDR_ST_CE_CT:
179 /* This state always preceeds the following states */
180 if (MMATCH("Content-T", CT_WS, IDLE)) {
181 switch (*bp) {
182 case 'r':
183 case 'R':
184 cks->mhdr_st = CK_MHDR_ST_CE;
185 break;
186 case 'y':
187 case 'Y':
188 cks->mhdr_st = CK_MHDR_ST_CT;
189 break;
190 default:
191 cks->mhdr_st = CK_MHDR_ST_IDLE;
192 return 0;
193 }
194 }
195 break;
196
197 case CK_MHDR_ST_CE:
198 MMATCH("ransfer-Encoding:", CE_WS, IDLE);
199 break;
200 case CK_MHDR_ST_CE_WS:
201 if (!span_ws(&bp, &bp_len))
202 return 0;
203 switch (*bp) {
204 case 'b':
205 case 'B':
206 cks->mhdr_st = CK_MHDR_ST_B64;
207 break;
208 case 'q':
209 case 'Q':
210 cks->mhdr_st = CK_MHDR_ST_QP;
211 break;
212 default:
213 cks->mhdr_st = CK_MHDR_ST_IDLE;
214 return 0;
215 }
216 break;
217 case CK_MHDR_ST_QP:
218 if (MMATCH("quoted-printable", IDLE, IDLE))
219 cks->mime_ce = DCC_CK_CE_QP;
220 break;
221 case CK_MHDR_ST_B64:
222 if (MMATCH("base64", IDLE, IDLE))
223 cks->mime_ce = DCC_CK_CE_B64;
224 break;
225
226 case CK_MHDR_ST_CT:
227 MMATCH("ype:", CT_WS, IDLE);
228 break;
229 case CK_MHDR_ST_CT_WS:
230 /* We have matched "Content-type:" */
231 if (!span_ws(&bp, &bp_len))
232 return 0;
233 switch (*bp) {
234 case 't':
235 case 'T':
236 cks->mhdr_st = CK_MHDR_ST_TEXT;
237 break;
238 case 'm':
239 case 'M':
240 /* do not nest too deeply */
241 if (in_hdrs
242 || cks->mime_nest < DIM(cks->mime_bnd)) {
243 cks->mhdr_st = CK_MHDR_ST_MULTIPART;
244 } else {
245 cks->mhdr_st = CK_MHDR_ST_TEXT;
246 cks->mhdr_st = CK_MHDR_ST_IDLE;
247 }
248 break;
249 default:
250 /* assume it is binary noise if it does
251 * not match "Content-type: [tTmM]" */
252 cks->mime_ct = DCC_CK_CT_BINARY;
253 cks->mhdr_st = CK_MHDR_ST_IDLE;
254 return 0;
255 }
256 break;
257 case CK_MHDR_ST_TEXT:
258 /* we are looking for "Text" in "Content-type: Text" */
259 if (MMATCH("text", HTML, IDLE))
260 cks->mime_ct = DCC_CK_CT_TEXT;
261 break;
262 case CK_MHDR_ST_HTML:
263 /* look for "Content-type: Text/html" */
264 if (MMATCH("/html", CSET_SKIP_PARAM, CSET_SKIP_PARAM))
265 cks->mime_ct = DCC_CK_CT_HTML;
266 break;
267 case CK_MHDR_ST_CSET_SKIP_PARAM:
268 /* Look for semicolon or whitespace preceding next
269 * parameter after "Content-type: Text/html" */
270 if (skip_param(&bp, &bp_len))
271 cks->mhdr_st = CK_MHDR_ST_CSET_SPAN_WS;
272 break;
273 case CK_MHDR_ST_CSET_SPAN_WS:
274 /* skip optional whitespace before next parameter */
275 if (span_ws(&bp, &bp_len))
276 cks->mhdr_st = CK_MHDR_ST_CSET;
277 break;
278 case CK_MHDR_ST_CSET:
279 /* have matched "Content-Type: text...;"
280 * and are looking for a "charset=" parameter */
281 MMATCH("charset=", CSET_ISO_8859, CSET_SKIP_PARAM);
282 break;
283 case CK_MHDR_ST_CSET_ISO_8859:
284 /* We have matched "Content-Type: text...charset="
285 * and are looking for "ISO-8859-*".
286 * Ignore leading '"' */
287 if (cks->mhdr_pos == 0
288 && bp_len > 0 && *bp == '"') {
289 ++bp;
290 --bp_len;
291 }
292 MMATCH("iso-8859-", CSET_ISO_X, IDLE);
293 break;
294 case CK_MHDR_ST_CSET_ISO_X:
295 for (;;) {
296 if (bp_len == 0)
297 return 0;
298 --bp_len;
299 c = *bp++;
300 if (c < '0' || c > '9') {
301 if ((c == '"' || c == ' ' || c == '\t'
302 || c == ';'
303 || c == '\r' || c == '\n')
304 && cks->mhdr_pos == 2)
305 cks->mime_cset = dcc_cset_2;
306 else
307 cks->mime_cset = dcc_cset_1;
308 cks->mhdr_st = CK_MHDR_ST_IDLE;
309 return 0;
310 }
311 cks->mhdr_pos = cks->mhdr_pos*10 + c - '0';
312 if (cks->mhdr_pos > 99) {
313 cks->mhdr_st = CK_MHDR_ST_IDLE;
314 return 0;
315 }
316 }
317 case CK_MHDR_ST_MULTIPART:
318 /* We are looking for "Content-type: Multipart"
319 * after having seen "Content-type: M".
320 * If it is not "ultipart", assume "essage" and that
321 * it is text. */
322 cks->mhdr_st = CK_MHDR_ST_TEXT;
323 MMATCH("multipart", BND_SKIP_PARAM, IDLE);
324 break;
325 case CK_MHDR_ST_BND_SKIP_PARAM:
326 /* Look for semicolon or whitespace preceding next
327 * parameter after "Content-type: M" */
328 if (skip_param(&bp, &bp_len))
329 cks->mhdr_st = CK_MHDR_ST_BND_SPAN_WS;
330 break;
331 case CK_MHDR_ST_BND_SPAN_WS:
332 /* skip optional whitespace before next parameter */
333 if (span_ws(&bp, &bp_len))
334 cks->mhdr_st = CK_MHDR_ST_BND;
335 break;
336 case CK_MHDR_ST_BND:
337 /* we have matched "Content-type: multipart"
338 * and are looking for the "boundary" parameter */
339 if (MMATCH("boundary=", BND_VALUE, BND_SKIP_PARAM)) {
340 if (in_hdrs) {
341 cks->mime_nest = 0;
342 /* allow missing initial blank line */
343 cks->mime_bnd_matches = 1;
344 }
345 bndp = &cks->mime_bnd[cks->mime_nest];
346 cks->flags &= ~DCC_CKS_MIME_QUOTED;
347 bndp->bnd[0] = '-';
348 bndp->bnd[1] = '-';
349 cks->mhdr_pos = 2;
350 }
351 break;
352 case CK_MHDR_ST_BND_VALUE:
353 /* collect the bounary string */
354 bndp = &cks->mime_bnd[cks->mime_nest];
355 /* this accepts a lot more than RFC 2046 allows,
356 * but spamware written by idiots doesn't comply */
357 for (;;) {
358 if (bp_len == 0)
359 return 0;
360 --bp_len;
361 c = *bp++;
362 if (c == '\n')
363 break;
364 if (c == '\r')
365 continue;
366 if ((c == ' ' || c == '\t' || c == ';')
367 && !(cks->flags & DCC_CKS_MIME_QUOTED))
368 break;
369 if (c == '"') {
370 cks->flags ^= DCC_CKS_MIME_QUOTED;
371 continue;
372 }
373 bndp->bnd[cks->mhdr_pos] = c;
374 if (++cks->mhdr_pos >= DCC_CK_BND_MAX) {
375 cks->mhdr_st = CK_MHDR_ST_IDLE;
376 return 0;
377 }
378 }
379 bndp->bnd_len = cks->mhdr_pos;
380 bndp->cmp_len = 0;
381 ++cks->mime_nest;
382 cks->mhdr_st = CK_MHDR_ST_IDLE;
383 break;
384 }
385 } while (bp_len != 0);
386 return 0;
387
388 #undef MMATCH
389 #undef MKSIP_WS
390 }
391
392
393
394 /* fetch bytes and convert from quoted-printable */
395 u_int /* output length */
396 dcc_ck_qp_decode(DCC_GOT_CKS *cks, const char **ibufp, u_int *ibuf_lenp,
397 char *obuf, u_int obuf_len)
398 {
399 # define GC(c) do {if (!ibuf_len) return result; \
400 --ibuf_len; (c) = *ibuf; ++ibuf;} while (0)
401 u_int ibuf_len, result;
402 const char *ibuf;
403 u_char c = 0;
404
405 if (obuf_len == 0)
406 return 0;
407 ibuf_len = *ibuf_lenp;
408 ibuf = *ibufp;
409 result = 0;
410 while (ibuf_len != 0) {
411 switch (cks->qp.state) {
412 case DCC_CK_QP_IDLE:
413 GC(c);
414 if (c != '=')
415 break;
416 cks->qp.state = DCC_CK_QP_EQ;
417 continue;
418
419 case DCC_CK_QP_EQ:
420 /* Consider first character after '=' */
421 GC(c);
422 cks->qp.x = c;
423 if (c == '\r') {
424 ;
425 } else if (c == '\n') {
426 /* delete "=\n" like "=\r\n"
427 * so that dccproc and dccm agree */
428 cks->qp.state = DCC_CK_QP_IDLE;
429 continue;
430 } else if (c >= '0' && c <= '9') {
431 cks->qp.n = c-'0';
432 } else if (c >= 'a' && c <= 'f') {
433 cks->qp.n = c-('a'-10);
434 } else if (c >= 'A' && c <= 'F') {
435 cks->qp.n = c-('A'-10);
436 } else {
437 cks->qp.state = DCC_CK_QP_FAIL1;
438 c = '=';
439 break;
440 }
441 cks->qp.state = DCC_CK_QP_1;
442 continue;
443
444 case DCC_CK_QP_1:
445 /* consider second character after '=' */
446 GC(c);
447 cks->qp.y = c;
448 if (cks->qp.x == '\r') {
449 if (c == '\n') {
450 /* delete soft line-break */
451 cks->qp.state = DCC_CK_QP_IDLE;
452 continue;
453 }
454 cks->qp.state = DCC_CK_QP_FAIL2;
455 c = '=';
456 break;
457 } else if (c >= '0' && c <= '9') {
458 c -= '0';
459 } else if (c >= 'a' && c <= 'f') {
460 c -= ('a'-10);
461 } else if (c >= 'A' && c <= 'F') {
462 c -= ('A'-10);
463 } else {
464 cks->qp.state = DCC_CK_QP_FAIL2;
465 c = '=';
466 break;
467 }
468 cks->qp.state = DCC_CK_QP_IDLE;
469 c = (cks->qp.n << 4) | c;
470 break;
471
472 case DCC_CK_QP_FAIL1:
473 /* output character after '=' of a 2-character
474 * sequence that was not quoted-printable after all */
475 cks->qp.state = DCC_CK_QP_IDLE;
476 c = cks->qp.x;
477 break;
478
479 case DCC_CK_QP_FAIL2:
480 /* output character after '=' of a 3-character
481 * sequence that was not quoted-printable after all */
482 cks->qp.state = DCC_CK_QP_FAIL3;
483 c = cks->qp.x;
484 break;
485
486 case DCC_CK_QP_FAIL3:
487 /* output third character of a 3-character
488 * sequence that was not quoted-printable after all */
489 cks->qp.state = DCC_CK_QP_IDLE;
490 c = cks->qp.y;
491 break;
492 }
493
494 *obuf++ = c;
495 if (++result >= obuf_len)
496 break;
497 }
498 *ibuf_lenp = ibuf_len;
499 *ibufp = ibuf;
500 return result;
501 #undef GC
502 }
503
504
505
506
507 #define B64B 0100 /* bad */
508 #define B64EQ 0101 /* '=' */
509 static u_char base64_decode[128] = {
510 B64B, B64B, B64B, B64B, B64B, B64B, B64B, B64B, /* 0x00 */
511 B64B, B64B, B64B, B64B, B64B, B64B, B64B, B64B, /* 0x08 */
512 B64B, B64B, B64B, B64B, B64B, B64B, B64B, B64B, /* 0x10 */
513 B64B, B64B, B64B, B64B, B64B, B64B, B64B, B64B, /* 0x18 */
514
515 B64B, B64B, B64B, B64B, B64B, B64B, B64B, B64B, /* 0x20 ! " # $ % & ' */
516 B64B, B64B, B64B, 62, B64B, B64B, B64B, 63, /* 0x28 ( ) * + , - . / */
517
518 52, 53, 54, 55, 56, 57, 58, 59, /* 0x30 0 1 2 3 4 5 6 7 */
519 60, 61, B64B, B64B, B64B, B64EQ,B64B, B64B, /* 0x38 8 9 : ; < = > ? */
520
521 B64B, 0, 1, 2, 3, 4, 5, 6, /* 0x40 @ A B C D E F G */
522 7, 8, 9, 10, 11, 12, 13, 14, /* 0x48 H I J K L M N O */
523
524 15, 16, 17, 18, 19, 20, 21, 22, /* 0x50 P Q R S T U V W */
525 23, 24, 25, B64B, B64B, B64B, B64B, B64B, /* 0x58 X Y Z [ \ ] ^ _ */
526
527 B64B, 26, 27, 28, 29, 30, 31, 32, /* 0x60 ` a b c d e f g */
528 33, 34, 35, 36, 37, 38, 39, 40, /* 0x68 h i j k l m n o */
529
530 41, 42, 43, 44, 45, 46, 47, 48, /* 0x70 p q r s t u v w */
531 49, 50, 51, B64B, B64B, B64B, B64B, B64B, /* 0x78 x y z { | } ~ del */
532 };
533
534 u_int /* output length */
535 dcc_ck_b64_decode(DCC_GOT_CKS *cks, const char **ibufp, u_int *ibuf_lenp,
536 char *obuf, u_int obuf_len)
537 {
538 u_char c;
539 const char *ibuf;
540 u_int ibuf_len, result;
541
542 if (obuf_len < 3)
543 return 0;
544 obuf_len -= 3;
545 ibuf_len = *ibuf_lenp;
546 ibuf = *ibufp;
547 result = 0;
548 while (ibuf_len != 0) {
549 --ibuf_len;
550 c = *ibuf++;
551 c = base64_decode[c];
552 if (c == B64B)
553 continue;
554
555 if (c == B64EQ) {
556 switch (cks->b64.quantum_cnt) {
557 case 2:
558 *obuf++ = cks->b64.quantum>>4;
559 ++result;
560 break;
561 case 3:
562 *obuf++ = cks->b64.quantum>>10;
563 *obuf++ = cks->b64.quantum>>2;
564 result += 2;
565 break;
566 }
567 cks->b64.quantum_cnt = 0;
568 if (result >= obuf_len)
569 break;
570 }
571
572 cks->b64.quantum = (cks->b64.quantum << 6) | c;
573 if (++cks->b64.quantum_cnt >= 4) {
574 cks->b64.quantum_cnt = 0;
575 *obuf++ = cks->b64.quantum>>16;
576 *obuf++ = cks->b64.quantum>>8;
577 *obuf++ = cks->b64.quantum;
578 result += 3;
579 if (result >= obuf_len)
580 break;
581 }
582 }
583 *ibuf_lenp = ibuf_len;
584 *ibufp = ibuf;
585 return result;
586 }
587
588
589
/* skip parts of URLs
 *	Advance a per-character state machine kept in *url by one input
 *	character c.  The state recognizes "http://" or "https://", optionally
 *	after '=' and an opening quote, then the host name, and then the
 *	remainder of the URL.
 *
 *	The result is the character shifted left by DCC_CK_URL_SHIFT and
 *	ORed with one of the DCC_CK_URL_* codes.  The character in the result
 *	is c after conversion to lower case and, in a host name, after
 *	%XX decoding.
 *
 *	*pbufp is read as the current end of the text collected so far.
 *	While in the host name this function moves it back to drop a user
 *	name or an over-long first label, and uses memmove() on the collected
 *	bytes to drop leading domain labels.
 *	NOTE(review): presumably the caller appends to *pbufp according to
 *	the returned code; that is not visible in this function -- confirm
 *	against the caller. */
int
dcc_ck_url(DCC_URL_SKIP *url, char c, char **pbufp)
{
/* return the (possibly modified) character combined with a result code */
#define RET_C(s) return ((c<<DCC_CK_URL_SHIFT) | s)

	/* Continue skipping a URL to its end.
	 * Assume the end is the next blank, comma, '>', or '\n'
	 * unless the URL is quoted.  Then continue to the quote
	 * or until the length has become silly. */

	/* convert ASCII upper to lower case */
	if (c >= 'A' && c <= 'Z')
		c -= 'A' - 'a';

	switch (url->st) {
	case DCC_URL_ST_IDLE:
		if (c == 'h') {
			/* start looking for 't' after 'h' in "http" */
			url->flags = 0;
			url->st = DCC_URL_ST_T1;
		} else if (c == '=') {
			/* look for the '=' in "href=" or "img src=" */
			url->st = DCC_URL_ST_QUOTE;
		}
		RET_C(DCC_CK_URL_CHAR);

	case DCC_URL_ST_QUOTE:
		/* look for '"' or 'H' after "href=" or "img src=" */
		if (c == 'h') {
			url->flags &= ~DCC_URL_QUOTES;
			url->st = DCC_URL_ST_T1;
		} else if (c == '"') {
			url->flags |= DCC_URL_DQUOTED;
			url->st = DCC_URL_ST_QH;
		} else if (c == '\'') {
			url->flags |= DCC_URL_SQUOTED;
			url->st = DCC_URL_ST_QH;
		} else {
			url->st = DCC_URL_ST_IDLE;
		}
		RET_C(DCC_CK_URL_CHAR);

	case DCC_URL_ST_QH:
		/* seen quote; looking for start of URL */
		if (c == 'h') {
			url->st = DCC_URL_ST_T1;
		} else {
			url->st = DCC_URL_ST_IDLE;
		}
		RET_C(DCC_CK_URL_CHAR);

	case DCC_URL_ST_T1:
		/* we are expecting the first 't' of "http" */
		if (c == 't')
			url->st = DCC_URL_ST_T2;
		else
			url->st = DCC_URL_ST_IDLE;
		RET_C(DCC_CK_URL_CHAR);

	case DCC_URL_ST_T2:
		/* we are expecting the second 't' of "http" */
		if (c == 't')
			url->st = DCC_URL_ST_P;
		else
			url->st = DCC_URL_ST_IDLE;
		RET_C(DCC_CK_URL_CHAR);

	case DCC_URL_ST_P:
		/* we are expecting the 'p' of "http" */
		if (c == 'p')
			url->st = DCC_URL_ST_S;
		else
			url->st = DCC_URL_ST_IDLE;
		RET_C(DCC_CK_URL_CHAR);

	case DCC_URL_ST_S:
		/* we are expecting the ':' or 's' after http */
		if (c == 's')
			url->st = DCC_URL_ST_COLON;
		else if (c == ':')
			url->st = DCC_URL_ST_SLASH1;
		else
			url->st = DCC_URL_ST_IDLE;
		RET_C(DCC_CK_URL_CHAR);

	case DCC_URL_ST_COLON:
		/* we are expecting the ':' after http or https */
		if (c == ':')
			url->st = DCC_URL_ST_SLASH1;
		else
			url->st = DCC_URL_ST_IDLE;
		RET_C(DCC_CK_URL_CHAR);

	case DCC_URL_ST_SLASH1:
		/* we are expecting the first '/' after http: */
		if (c == '/')
			url->st = DCC_URL_ST_SLASH2;
		else
			url->st = DCC_URL_ST_IDLE;
		RET_C(DCC_CK_URL_CHAR);

	case DCC_URL_ST_SLASH2:
		/* we are expecting the second '/' after "http:/" */
		if (c != '/') {
			url->st = DCC_URL_ST_IDLE;
			RET_C(DCC_CK_URL_CHAR);
		}
		/* the next character starts the host name */
		url->st = DCC_URL_ST_SLASH3_START;
		RET_C(DCC_CK_URL_CK_LEN);

	case DCC_URL_ST_SLASH3_START:
		/* first character of a host name, either after "//" or
		 * after a reset below;  remember where the name starts in
		 * the buffer and forget what was learned about the
		 * previous attempt */
		url->dot = 0;
		url->start = *pbufp;
		url->total = 0;
		url->flags &= ~(DCC_URL_DEL_DOMAIN
				| DCC_URL_PERCENT1 | DCC_URL_PERCENT2);
		url->st = DCC_URL_ST_SLASH3;
		/* fall into DCC_URL_ST_SLASH3 */
	case DCC_URL_ST_SLASH3:
		/* look for the end of the host name */
		++url->total;
		/* a character decoded from %XX re-enters here so that it is
		 * treated like a literal one without being counted again */
again:
		if (c == '.') {
			/* keep only 1st and 2nd level domain names */
			url->flags &= ~DCC_URL_DEL_DOMAIN;
			if (!url->dot) {
				/* do nothing at first '.' unless the name
				 * was too long */
				if (*pbufp >= url->start+DCC_URL_MAX) {
					*pbufp = url->start;
				} else {
					url->dot = *pbufp;
				}
			} else {
				/* a later '.':  the text before the
				 * remembered dot is dropped if and when
				 * another label character follows */
				url->flags |= DCC_URL_DEL_DOMAIN;
			}
			RET_C(DCC_CK_URL_DOT);
		}
		/* delay deleting third level domains to not be
		 * fooled by a trailing dot */
		if (url->flags & DCC_URL_DEL_DOMAIN) {
			url->flags &= ~DCC_URL_DEL_DOMAIN;
			/* slide everything from the remembered dot to
			 * the start of the name and remember the new end
			 * as the position of the latest dot */
			memmove(url->start, url->dot,
				*pbufp - url->dot);
			*pbufp -= (url->dot - url->start);
			url->dot = *pbufp;
		}

		if (c == '/') {
			/* the path follows the host name */
			url->st = DCC_URL_ST_SKIP;
			RET_C(DCC_CK_URL_HOST_END);
		}
		/* a closing quote of the kind that opened the URL */
		if (c == '"' && (url->flags & DCC_URL_DQUOTED)) {
			url->st = DCC_URL_ST_IDLE;
			RET_C(DCC_CK_URL_HOST_END);
		}
		if (c == '\'' && (url->flags & DCC_URL_SQUOTED)) {
			url->st = DCC_URL_ST_IDLE;
			RET_C(DCC_CK_URL_HOST_END);
		}
		/* angle brackets end only an unquoted URL */
		if ((c == '<' || c == '>')
		    && (url->flags & DCC_URL_QUOTES) == 0) {
			url->st = DCC_URL_ST_IDLE;
			RET_C(DCC_CK_URL_HOST_END);
		}
		if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
			/* NOTE(review): this is the only test that uses
			 * DCC_URL_QUOTED;  the others use DCC_URL_QUOTES.
			 * Confirm that both are defined and which is
			 * intended. */
			if (!(url->flags & DCC_URL_QUOTED)
			    || url->total > DCC_URL_FAILSAFE) {
				url->st = DCC_URL_ST_IDLE;
				RET_C(DCC_CK_URL_HOST_END);
			}
			/* whitespace in a URL hostname is at best username */
			*pbufp = url->start;
			url->st = DCC_URL_ST_SLASH3_START;
			RET_C(DCC_CK_URL_HOST_RESET);
		}
		if (c == '@') {
			/* ignore username and password */
			*pbufp = url->start;
			url->st = DCC_URL_ST_SLASH3_START;
			RET_C(DCC_CK_URL_HOST_RESET);
		}

		if (c == '%') {
			/* start of a %XX escape;  expect two hex digits */
			url->flags &= ~DCC_URL_PERCENT2;
			url->flags |= DCC_URL_PERCENT1;
			RET_C(DCC_CK_URL_SKIP);
		}
		if (url->flags & DCC_URL_PERCENT1) {
			/* convert a hex digit;  upper case digits were
			 * converted to lower case on entry */
			if (c >= '0' && c <= '9') {
				c -= '0';
			} else if (c >= 'a' && c <= 'f') {
				c -= 'a'-10;
			} else {
				/* not an escape after all;  start over */
				*pbufp = url->start;
				url->st = DCC_URL_ST_SLASH3_START;
				RET_C(DCC_CK_URL_HOST_RESET);
			}
			if (url->flags & DCC_URL_PERCENT2) {
				/* second digit:  combine it with the saved
				 * high nibble and handle the decoded
				 * character as if it had been literal */
				url->flags &= ~(DCC_URL_PERCENT1
						| DCC_URL_PERCENT2);
				c |= url->percent;
				if (c >= 'A' && c <= 'Z')
					c -= 'A' - 'a';
				goto again;
			}
			/* first digit:  save it as the high nibble */
			url->percent = c << 4;
			url->flags |= DCC_URL_PERCENT2;
			RET_C(DCC_CK_URL_SKIP);
		}

		if (*pbufp >= url->start+DCC_URL_MAX) {
			/* long garbage is probably a username */
			if (url->total > DCC_URL_FAILSAFE) {
				url->st = DCC_URL_ST_IDLE;
				RET_C(DCC_CK_URL_CHAR);
			}
			RET_C(DCC_CK_URL_SKIP);
		}
		/* an ordinary character of the host name */
		RET_C(DCC_CK_URL_HOST);

	case DCC_URL_ST_SKIP:
		/* skip the rest of the URL */
		++url->total;
		/* either kind of quote ends the URL */
		if (c == '"' || c == '\'') {
			url->st = DCC_URL_ST_IDLE;
			RET_C(DCC_CK_URL_SKIP);
		}
		/* '>' or white space ends an unquoted URL, or a quoted one
		 * that has become implausibly long */
		if ((c == '>' || c == ' ' || c == '\t'
		     || c == '\n' || c == '\r')
		    && (!(url->flags & DCC_URL_QUOTES)
			|| url->total > DCC_URL_FAILSAFE)) {
			url->total = 0;
			url->st = DCC_URL_ST_IDLE;
			RET_C(DCC_CK_URL_CHAR);
		}
		RET_C(DCC_CK_URL_SKIP);
	}
	/* reached only when url->st is none of the states above */
	RET_C(DCC_CK_URL_CHAR);

#undef RET_C
}