comparison srvrlib/db.c @ 0:c7f6b056b673

First import of vendor version
author Peter Gervai <grin@grin.hu>
date Tue, 10 Mar 2009 13:49:58 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c7f6b056b673
1 /* Distributed Checksum Clearinghouse
2 *
3 * server database functions
4 *
5 * Copyright (c) 2008 by Rhyolite Software, LLC
6 *
7 * This agreement is not applicable to any entity which sells anti-spam
8 * solutions to others or provides an anti-spam solution as part of a
9 * security solution sold to other entities, or to a private network
10 * which employs the DCC or uses data provided by operation of the DCC
11 * but does not provide corresponding data to other users.
12 *
13 * Permission to use, copy, modify, and distribute this software without
14 * changes for any purpose with or without fee is hereby granted, provided
15 * that the above copyright notice and this permission notice appear in all
16 * copies and any distributed versions or copies are either unchanged
17 * or not called anything similar to "DCC" or "Distributed Checksum
18 * Clearinghouse".
19 *
20 * Parties not eligible to receive a license under this agreement can
21 * obtain a commercial license to use DCC by contacting Rhyolite Software
22 * at sales@rhyolite.com.
23 *
24 * A commercial license would be for Distributed Checksum and Reputation
25 * Clearinghouse software. That software includes additional features. This
26 * free license for Distributed ChecksumClearinghouse Software does not in any
27 * way grant permision to use Distributed Checksum and Reputation Clearinghouse
28 * software
29 *
30 * THE SOFTWARE IS PROVIDED "AS IS" AND RHYOLITE SOFTWARE, LLC DISCLAIMS ALL
31 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
32 * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL RHYOLITE SOFTWARE, LLC
33 * BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
34 * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
35 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
36 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
37 * SOFTWARE.
38 *
39 * Rhyolite Software DCC 1.3.103-1.214 $Revision$
40 */
41
42 #include "srvr_defs.h"
43 #include <syslog.h>
44 #include <sys/resource.h>
45 #if defined(HAVE_HW_PHYSMEM) || defined(HAVE_BOOTTIME)
46 #include <sys/sysctl.h>
47 #endif
48 #ifdef HAVE_PSTAT_GETSTATIC /* HP-UX */
49 #include <sys/pstat.h>
50 #endif
51
52 DB_STATS db_stats;
53
54 DB_STATES db_sts; /* buffer references; all are dropped by rel_db_states() */
55
56 DCC_PATH db_path_buf;
57
58 int db_fd = -1; /* database file; reset to -1 by db_close() */
59 DCC_PATH db_nm; /* name reported for DB_BUF_TYPE_DB buffers */
60 int db_hash_fd = -1; /* hash table file; reset to -1 by db_close() */
61 DCC_PATH db_hash_nm; /* name reported for DB_BUF_TYPE_HASH buffers */
62 struct timeval db_locked; /* 1=database not locked; set to db_time by db_lock(), tv_sec zeroed by db_close() */
63
64 struct timeval db_time; /* time snapshot used for flush deadlines and db_locked */
65
66 int db_debug;
67
68 u_char grey_on;
69 static u_char db_use_write; /* 0=no 1=if RAM big enough 2=always */
70 static u_char db_dirty; /* cleared by make_clean() when it ends clean */
71 static u_char db_rdonly; /* consulted by db_lock() and make_clean() */
72 int db_failed_line; /* bad happened at this line # */
73 const char *db_failed_file; /* in this file */
74 static u_char db_invalidate; /* do not write to the files */
75
76 /* Without mmap(MAP_NOSYNC) as on Solaris or a good msync() as on BSD/OS,
77 * we must rely on the kernel's update/syncer/bufdaemon/etc. So in this
78 * case just fondle the mmap()'ed pages and hope things work out.
79 *
80 * With a msync() and with mmap(MAP_NOSYNC), use MAP_NOSYNC if we can because
81 * some systems flush too quickly while others such as FreeBSD 6.1 stall
82 * for seconds while thinking about flushing the database.
83 * But with mmap(MAP_NOSYNC) we leave large amounts of data in RAM that take
84 * too long to be pushed to the disk when the system is shutting down.
85 * So
86 * - hit only those chunks of memory with real data or changes to data
87 * with msync(). Trust dbclean to rebuild everything else at need.
88 *
89 * - when it seems the system is being shut down, delete the hash table
90 * and let it be rebuilt when the system is rebooted. When the
91 * hash table is rebuilt, "obsolete" markings in the data file that
92 * might have been lost will be remade.
93 *
94 * A third case involves dccd -F. It requires that all changes be pushed to
95 * the disk whenever dccd unlocks the database so that dbclean can see changes
96 * dccd makes. It also requires that dbclean write all of its changes so
97 * that dccd will find them when it reopens the database.
98 */
99
100 #if !defined(MAP_NOSYNC) || defined(HAVE_OLD_MSYNC) || !defined(HAVE_BOOTTIME)
101 #undef USE_MAP_NOSYNC
102 #else
103 #define USE_MAP_NOSYNC
104 #endif
105
106 static u_char db_not_synced; /* database unsynchronized with disk */
107
108
109 #define DCC_MADV_WILLNEED(p) 0
110 #ifdef MADV_WILLNEED
111 #undef DCC_MADV_WILLNEED
112 #define DCC_MADV_WILLNEED(p) madvise(p, db_pagesize, MADV_WILLNEED)
113 #endif
114 #ifdef POSIX_MADV_WILLNEED
115 #undef DCC_MADV_WILLNEED
116 #define DCC_MADV_WILLNEED(p) posix_madvise(p, db_pagesize, POSIX_MADV_WILLNEED)
117 #endif
118
119 #define DCC_MADV_RANDOM(p) 0
120 #ifdef MADV_RANDOM
121 #undef DCC_MADV_RANDOM
122 #define DCC_MADV_RANDOM(p) madvise(p, db_pagesize, MADV_RANDOM)
123 #endif
124 #ifdef POSIX_MADV_RANDOM
125 #undef DCC_MADV_RANDOM
126 #define DCC_MADV_RANDOM(p) posix_madvise(p, db_pagesize, POSIX_MADV_RANDOM)
127 #endif
128
129 #define DCC_MADV_DONTNEED(p) 0
130 /* The Linux people claim that it is just fine that their notion of
131 * MADV_DONTNEED implies discarding changes to data. Worse, some versions of
132 * Linux/GNU libc define POSIX_MADV_DONTNEED as the data-corrupting Linux
133 * MADV_DONTNEED. This seems to be because they cannot admit their mistake of
134 * not distinguishing between the functions of MADV_FREE and MADV_DONTNEED and
135 * their misreading of other systems' documentation for MADV_DONTNEED */
136 #ifndef linux
137 #ifdef MADV_DONTNEED
138 #undef DCC_MADV_DONTNEED
139 #define DCC_MADV_DONTNEED(p) madvise(p, db_pagesize, MADV_DONTNEED)
140 #endif
141 #ifdef POSIX_MADV_DONTNEED
142 #undef DCC_MADV_DONTNEED
143 #define DCC_MADV_DONTNEED(p) posix_madvise(p, db_pagesize, POSIX_MADV_DONTNEED)
144 #endif
145 #endif /* !linux */
146
147 #define DCC_MADV_FREE(p) 0
148 #ifdef MADV_FREE
149 #undef DCC_MADV_FREE
150 #define DCC_MADV_FREE(p) madvise(p, db_pagesize, MADV_FREE)
151 #endif
152 #ifdef POSIX_MADV_FREE
153 #undef DCC_MADV_FREE
154 #define DCC_MADV_FREE(p) posix_madvise(p, db_pagesize, POSIX_MADV_FREE)
155 #endif
156
157
158 u_char db_minimum_map; /* this is dccd & dbclean is running */
159
160 int db_buf_total; /* total # of db buffers */
161 DB_PTR db_max_rss; /* maximum db resident set size */
162 DB_PTR db_max_byte; /* maximum db bytes in both files */
163
164 static u_int system_pagesize; /* kernel page size */
165
166 static DB_BUF db_bufs[DB_BUF_MAX]; /* control mmap()'ed blocks */
167 static DB_BUF *buf_oldest, *buf_newest;
168
169 #define DB_HASH_TOTAL DB_BUF_MAX
170 static DB_BUF *db_buf_hash[DB_HASH_TOTAL];
171 /* fancy 16-bit multiplicative hash assumes multiplication needs 1 cycle
172 * and so the hash is faster than dealing with a collision */
173 #define DB_BUF_HASH(pnum,t) (&db_buf_hash[((((pnum)*(t)*0x9ccf) & 0xffff) \
174 * DB_BUF_MAX) >> 16])
175
176 time_t db_need_flush_secs;
177 static time_t db_urgent_need_flush_secs;
178
179 const DB_VERSION_BUF db_version_buf = DB_VERSION_STR;
180 DB_PARMS db_parms;
181 static DB_PARMS db_parms_stored;
182
183 DCC_TGTS db_tholds[DCC_DIM_CKS];
184
185 u_int db_pagesize; /* size of 1 mmap()'ed buffer */
186 static u_int db_pagesize_part;
187
188 DB_HOFF db_hash_fsize; /* size of hash table file */
189 static u_int hash_clear_pg_num;
190 DB_HADDR db_hash_len; /* # of hash table entries */
191 DB_HADDR db_hash_divisor; /* modulus */
192 DB_HADDR db_hash_used; /* # of hash table entries in use */
193 u_int db_hash_page_len; /* # of HASH_ENTRY's per buffer */
194 DB_HADDR db_max_hash_entries = 0; /* after db_buf_init()*/
195 DB_PTR db_fsize; /* size of database file */
196 DB_PTR db_csize; /* size of database contents in bytes */
197 static DB_PTR db_csize_stored_hash; /* DB size stored in hash file */
198 static DB_HADDR db_hash_used_stored_hash;
199 u_int db_page_max; /* only padding after this in DB buf */
200 static DB_PTR db_window_size; /* size of mmap() window */
201 char db_window_size_str[128];
202 static char db_physmem_str[80];
203
204 static const u_char dcc_ck_fuzziness[DCC_DIM_CKS] = {
205 0, /* DCC_CK_INVALID */
206 DCC_CK_FUZ_LVL_NO, /* DCC_CK_IP */
207 DCC_CK_FUZ_LVL_NO, /* DCC_CK_ENV_FROM */
208 DCC_CK_FUZ_LVL_NO, /* DCC_CK_FROM */
209 DCC_CK_FUZ_LVL_NO, /* DCC_CK_SUB */
210 DCC_CK_FUZ_LVL_NO, /* DCC_CK_MESSAGE_ID */
211 DCC_CK_FUZ_LVL_NO, /* DCC_CK_RECEIVED */
212 DCC_CK_FUZ_LVL_NO, /* DCC_CK_BODY */
213 DCC_CK_FUZ_LVL1, /* DCC_CK_FUZ1 */
214 DCC_CK_FUZ_LVL2, /* DCC_CK_FUZ2 */
215 DCC_CK_FUZ_LVL_REP, /* DCC_CK_REP_TOTAL */
216 DCC_CK_FUZ_LVL_REP, /* DCC_CK_REP_BULK */
217 DCC_CK_FUZ_LVL2, /* DCC_CK_SRVR_ID */
218 DCC_CK_FUZ_LVL2 /* DCC_CK_ENV_TO */
219 };
220 static const u_char grey_ck_fuzziness[DCC_DIM_CKS] = {
221 0, /* DCC_CK_INVALID */
222 DCC_CK_FUZ_LVL2, /* DCC_CK_IP */
223 DCC_CK_FUZ_LVL_NO, /* DCC_CK_ENV_FROM */
224 DCC_CK_FUZ_LVL_NO, /* DCC_CK_FROM */
225 DCC_CK_FUZ_LVL_NO, /* DCC_CK_SUB */
226 DCC_CK_FUZ_LVL_NO, /* DCC_CK_MESSAGE_ID */
227 DCC_CK_FUZ_LVL_NO, /* DCC_CK_RECEIVED */
228 DCC_CK_FUZ_LVL_NO, /* DCC_CK_BODY */
229 DCC_CK_FUZ_LVL_NO, /* DCC_CK_FUZ1 */
230 DCC_CK_FUZ_LVL_NO, /* DCC_CK_FUZ2 */
231 DCC_CK_FUZ_LVL_NO, /* DCC_CK_GREY_MSG */
232 DCC_CK_FUZ_LVL1, /* DCC_CK_GREY_TRIPLE */
233 DCC_CK_FUZ_LVL1, /* DCC_CK_SRVR_ID */
234 DCC_CK_FUZ_LVL1 /* DCC_CK_ENV_TO */
235 };
236 const u_char *db_ck_fuzziness = dcc_ck_fuzziness;
237
238
239 static u_char buf_flush(DCC_EMSG, DB_BUF *, u_char);
240 static u_char buf_munmap(DCC_EMSG, DB_BUF *);
241 static DB_BUF *find_buf(DCC_EMSG, DB_BUF_TYPE, DB_PG_NUM);
242 static u_char map_hash(DCC_EMSG, DB_HADDR, DB_STATE *, u_char);
243 static u_char map_hash_ctl(DCC_EMSG, u_char);
244 static u_char map_db(DCC_EMSG, DB_PTR, u_int, DB_STATE *, u_char);
245 static u_char db_set_sizes(DCC_EMSG);
246
247
/* Compute the least common multiple of two numbers.
 *	Returns 0 if either number is 0 instead of dividing by zero while
 *	looking for the greatest common divisor. */
static u_int
lcm(u_int n, u_int m)
{
    u_int big, gcd, rem;

    /* there is no positive common multiple of 0 */
    if (n == 0 || m == 0)
        return 0;

    /* first get the gcd of the two numbers with Euclid's algorithm */
    if (n >= m) {
        big = n;
        gcd = m;
    } else {
        big = m;
        gcd = n;
    }
    while ((rem = big % gcd) != 0) {
        big = gcd;
        gcd = rem;
    }

    /* divide before multiplying to keep the intermediate value small */
    return n * (m / gcd);
}
270
271
272
/* Format a database size or offset for people.
 *	Exact multiples of 1024, 1024*1024, or 1024*1024*1024 are divided and
 *	given a KB, MB, or GB suffix.  Values of 10,000 or more are split
 *	into comma separated groups of three digits.
 *	The result is "0" or one of four static buffers used in rotation,
 *	so at most four results can be in use at the same time. */
const char *
db_ptr2str(DB_PTR val)
{
    static int bufno;
    static struct {
        char str[32];		/* room for 4 groups, commas & units */
    } bufs[4];
    char *s;
    const char *units;

    if (val == 0)
        return "0";

    s = bufs[bufno].str;
    bufno = (bufno+1) % DIM(bufs);

    if (val % (1024*1024*1024) == 0) {
        val /= (1024*1024*1024);
        units = "GB";
    } else if (val % (1024*1024) == 0) {
        val /= (1024*1024);
        units = "MB";
    } else if (val % 1024 == 0) {
        val /= 1024;
        units = "KB";
    } else {
        units = "";
    }

    /* Use >= so that exact powers of 1000 get all of their groups
     * instead of a 4-digit leading group such as "1000,000" */
    if (val >= 1000*1000*1000)
        snprintf(s, sizeof(bufs[0].str), "%d,%03d,%03d,%03d%s",
                 (int)(val / (1000*1000*1000)),
                 (int)(val / (1000*1000)) % 1000,
                 (int)(val / 1000) % 1000,
                 (int)(val % 1000),
                 units);
    else if (val >= 1000*1000)
        snprintf(s, sizeof(bufs[0].str), "%d,%03d,%03d%s",
                 (int)(val / (1000*1000)),
                 (int)(val / 1000) % 1000,
                 (int)(val % 1000),
                 units);
    else if (val >= 1000*10)
        snprintf(s, sizeof(bufs[0].str), "%d,%03d%s",
                 (int)(val / 1000),
                 (int)(val % 1000),
                 units);
    else
        snprintf(s, sizeof(bufs[0].str), "%d%s",
                 (int)val,
                 units);
    return s;
}
325
326
327
/* Format a count or a size with a K, M, or G suffix into the caller's buffer.
 *	buf and buf_len describe the output buffer, which is also returned.
 *	num is the value to format.
 *	bytes_or_entries chooses the scale: 1=bytes scaled by 1024,
 *	    0=plain number scaled by 1000.
 *	Scaled values that round to 100 or more are printed as integers and
 *	smaller values with two significant digits. */
const char *
size2str(char *buf, u_int buf_len,
         double num, u_char bytes_or_entries)	/* 0=number 1=bytes */
{
    const char *units;
    double k;

    k = bytes_or_entries ? 1024.0 : 1000.0;

    if (num < k) {
        units = "";
    } else if (num < k*k) {
        num /= k;
        units = "K";
    } else if (num < k*k*k) {
        num /= k*k;
        units = "M";
    } else {
        num /= k*k*k;
        units = "G";
    }

    /* "%.2g" switches to exponent notation for values that round to 100,
     * so anything from 99.5 up must be printed with "%.0f" to avoid
     * results such as "1e+02K" */
    if (num >= 99.5)
        snprintf(buf, buf_len, "%.0f%s", num, units);
    else
        snprintf(buf, buf_len, "%.2g%s", num, units);
    return buf;
}
356
357
358
359 void PATTRIB(5,6)
360 db_failure(int linenum, const char *file, int ex_code, DCC_EMSG emsg,
361 const char *p, ...)
362 {
363 va_list args;
364
365 if (!db_failed_line) {
366 db_failed_line = linenum;
367 db_failed_file = file;
368 }
369 va_start(args, p);
370 dcc_vpemsg(ex_code, emsg, p, args);
371 va_end(args);
372 }
373
374
375
376 void PATTRIB(3,4)
377 db_error_msg(int linenum, const char *file, const char *p, ...)
378 {
379 va_list args;
380
381 if (!db_failed_line) {
382 db_failed_line = linenum;
383 db_failed_file = file;
384 }
385 va_start(args, p);
386 dcc_verror_msg(p, args);
387 va_end(args);
388 }
389
390
391
392 double /* hashes or bytes/second */
393 db_add_rate(const DB_PARMS *parms,
394 u_char hash_or_db) /* 1=hash */
395 {
396 struct timeval sn;
397 time_t new_rate_secs;
398 time_t total_secs;
399 double added, cur, prev;
400
401 total_secs = parms->rate_secs;
402 if (hash_or_db) {
403 added = parms->hash_added;
404 cur = parms->hash_used;
405 prev = parms->old_hash_used;
406 } else {
407 added = parms->db_added;
408 cur = parms->db_csize;
409 prev = parms->old_db_csize;
410 }
411
412 if (total_secs <= 0 || total_secs > DB_MAX_RATE_SECS
413 || added <= 0.0) {
414 added = 0.0;
415 total_secs = 0;
416 }
417
418 dcc_ts2timeval(&sn, &parms->sn);
419 new_rate_secs = parms->last_rate_sec - sn.tv_sec;
420 if (new_rate_secs > 0 && new_rate_secs <= DB_MAX_RATE_SECS
421 && cur > prev) {
422 total_secs += new_rate_secs;
423 added += cur - prev;
424 }
425
426 if (total_secs <= DB_MIN_RATE_SECS)
427 return -1.0;
428 return added / total_secs;
429 }
430
431
432
433 DB_NOKEEP_CKS
434 def_nokeep_cks(void)
435 {
436 DCC_CK_TYPES type;
437 DB_NOKEEP_CKS nokeep = 0;
438
439 for (type = DCC_CK_TYPE_FIRST; type <= DCC_CK_TYPE_LAST; ++type) {
440 if (DB_GLOBAL_NOKEEP(grey_on, type))
441 DB_SET_NOKEEP(nokeep, type);
442 }
443 DB_SET_NOKEEP(nokeep, DCC_CK_INVALID);
444 DB_SET_NOKEEP(nokeep, DCC_CK_FLOD_PATH);
445
446 return nokeep;
447 }
448
449
450
451 void
452 set_db_tholds(DB_NOKEEP_CKS nokeep)
453 {
454 DCC_CK_TYPES type;
455
456 for (type = 0; type < DIM(db_tholds); ++type) {
457 db_tholds[type] = (DB_TEST_NOKEEP(nokeep, type)
458 ? DCC_TGTS_INVALID
459 : DCC_CK_IS_REP_CMN(grey_on, type)
460 ? DCC_TGTS_INVALID
461 : grey_on ? 1
462 : type == DCC_CK_SRVR_ID ? 1
463 : BULK_THRESHOLD);
464 }
465 }
466
467
468
469 static const char *
470 buf2path(const DB_BUF *b)
471 {
472 switch (b->buf_type) {
473 case DB_BUF_TYPE_HASH:
474 return db_hash_nm;
475 case DB_BUF_TYPE_DB:
476 return db_nm;
477 case DB_BUF_TYPE_FREE:
478 default:
479 dcc_logbad(EX_SOFTWARE, "impossible buffer type for a path");
480 }
481 }
482
483
484
485 static int
486 buf2fd(const DB_BUF *b)
487 {
488 switch (b->buf_type) {
489 case DB_BUF_TYPE_HASH:
490 return db_hash_fd;
491 case DB_BUF_TYPE_DB:
492 return db_fd;
493 case DB_BUF_TYPE_FREE:
494 default:
495 dcc_logbad(EX_SOFTWARE, "impossible buffer type for fd");
496 }
497 }
498
499
500
501 static void
502 rel_db_state(DB_STATE *st)
503 {
504 DB_BUF *b;
505
506 b = st->b;
507 if (!b)
508 return;
509 st->b = 0;
510 st->d.v = 0;
511 st->s.rptr = DB_PTR_BAD;
512 if (--b->lock_cnt < 0)
513 dcc_logbad(EX_SOFTWARE,"negative database buffer lock");
514 }
515
516
517
518 void
519 rel_db_states(void)
520 {
521 DB_STATE *st;
522
523 for (st = &db_sts.rcd; st <= &db_sts.hash_ctl; ++st) {
524 rel_db_state(st);
525 }
526 }
527
528
529
530 /* release one or all unneeded buffers */
531 u_char /* 0=problem 1=did nothing 2=did>=1 */
532 db_unload(DCC_EMSG emsg,
533 u_char some) /* 0=all, 1=only one, 2=finished */
534 {
535 DB_BUF *b;
536 u_char result;
537
538 result = 1;
539 for (b = buf_oldest; b != 0; b = b->newer) {
540 if (b->buf_type == DB_BUF_TYPE_FREE
541 || b->lock_cnt != 0)
542 continue;
543 if (some == 2
544 && !(b->flags & DB_BUF_FG_USE_WRITE)
545 && 0 > DCC_MADV_DONTNEED(b->buf.v))
546 dcc_error_msg("madvise(DONTNEED %s,%#x): %s",
547 buf2path(b), db_pagesize, ERROR_STR());
548 if (!buf_munmap(emsg, b)) {
549 emsg = 0;
550 result = 0;
551 } else if (result) {
552 result = 2;
553 }
554 if (some == 1)
555 return result;
556 }
557
558 return result;
559 }
560
561
562
563 static u_char
564 buf_write_part(DCC_EMSG emsg, DB_BUF *b, off_t offset, void *buf, int len)
565 {
566 int i;
567
568 offset += (off_t)b->pg_num * (off_t)db_pagesize;
569
570 if (offset != lseek(buf2fd(b), offset, SEEK_SET)) {
571 db_failure(__LINE__,__FILE__, EX_IOERR, emsg,
572 "buf_write_part lseek(%s,"OFF_HPAT"): %s",
573 buf2path(b), offset, ERROR_STR());
574 return 0;
575 }
576 i = write(buf2fd(b), buf, len);
577 if (i != len) {
578 db_failure(__LINE__,__FILE__, EX_IOERR, emsg,
579 "buf_write_part(%s,%u)=%d: %s",
580 buf2path(b), len, i, ERROR_STR());
581 return 0;
582 }
583
584 return 1;
585 }
586
587
588
589 /* push part of a buffer toward the disk
590 * this can be needed even when the file has been opened and mapped
591 * read-only by dbclean */
592 static u_char
593 buf_flush_part(DCC_EMSG emsg, DB_BUF *b,
594 u_int part, /* DB_BUF_NUM_PARTS=buffer */
595 u_char async UATTRIB)
596 {
597 u_int flush_len;
598 char *flush_base;
599 DB_BUF_FM bit;
600
601 bit = PART2BIT(part) & (b->flush | b->flush_urgent);
602 if (!bit)
603 return 1;
604
605 /* Send a new buffer to disk at once. */
606 if (b->flags & DB_BUF_FG_EXTENSION) {
607 DB_BUF *b1, *b0;
608 u_char result;
609
610 /* To give the file system a chance to make the hash table
611 * contiguous, first write all preceding new buffers.
612 * In almost all cases, there will be none. */
613 result = 1;
614 do {
615 b0 = b;
616 for (b1 = buf_oldest; b1 != 0; b1 = b1->newer) {
617 if (!(b1->flags & DB_BUF_FG_EXTENSION)
618 || b1->buf_type != b0->buf_type
619 || b1->pg_num >= b0->pg_num)
620 continue;
621 b0 = b1;
622 }
623 b0->flags &= ~DB_BUF_FG_EXTENSION;
624 b0->flush = 0;
625 b0->flush_urgent = 0;
626 if (!db_invalidate
627 && !buf_write_part(emsg, b0,
628 0, b0->buf.c, db_pagesize))
629 result = 0;
630 } while (b0 != b);
631 return result;
632 }
633
634 flush_base = b->ranges[part].lo;
635 flush_len = b->ranges[part].hi - flush_base;
636 b->flush &= ~bit;
637 b->flush_urgent &= ~bit;
638
639 if (db_invalidate)
640 return 1;
641
642 if (b->flags & DB_BUF_FG_USE_WRITE) {
643 static char *wbuf;
644 static u_int wbuf_len;
645
646 /* In at least FreeBSD you cannot write() to the file
647 * that underlies a mmap() region from that region */
648 if (wbuf_len < db_pagesize_part) {
649 /* the page size for the current file
650 * might be different from the old file */
651 if (wbuf)
652 free(wbuf);
653 wbuf_len = db_pagesize_part;
654 wbuf = malloc(wbuf_len);
655 }
656
657 memcpy(wbuf, flush_base, flush_len);
658 return buf_write_part(emsg, b, flush_base - b->buf.c,
659 wbuf, flush_len);
660
661 #ifndef HAVE_OLD_MSYNC
662 } else if (async) {
663 if (0 > MSYNC(flush_base, flush_len, MS_ASYNC)) {
664 db_failure(__LINE__,__FILE__, EX_IOERR, emsg,
665 "msync(db buffer %s,%#lx,%#x,MS_ASYNC): %s",
666 buf2path(b), (long)flush_base, flush_len,
667 ERROR_STR());
668 return 0;
669 }
670 #endif
671 } else {
672 if (0 > MSYNC(flush_base, flush_len, MS_SYNC)) {
673 db_failure(__LINE__,__FILE__, EX_IOERR, emsg,
674 "msync(db buffer %s,%#lx,%#x,MS_SYNC): %s",
675 buf2path(b), (long)flush_base, flush_len,
676 ERROR_STR());
677 return 0;
678 }
679 }
680
681 return 1;
682 }
683
684
685
686 static u_char
687 buf_flush(DCC_EMSG emsg, DB_BUF *b, u_char async)
688 {
689 u_int part;
690 DB_BUF_FM bits;
691 u_char result = 1;
692
693 bits = b->flush_urgent | b->flush;
694 for (part = 0; bits != 0 && part < DB_BUF_NUM_PARTS; ++part) {
695 if (bits & PART2BIT(part)) {
696 if (!buf_flush_part(emsg, b, part, async)) {
697 emsg = 0;
698 result = 0;
699 }
700 bits = b->flush_urgent | b->flush;
701 }
702 }
703 return result;
704 }
705
706
707
708 /* Try to keep the data clean so that the fsync() required by Solaris
709 * when the file is unloaded is not too expensive.
710 * Try to flush frequently so that we don't stall as long in msync().
711 */
712 void
713 db_flush_needed(void)
714 {
715 static DB_BUF *next_b = db_bufs;
716 static u_int next_part;
717 DB_BUF *b;
718 u_int part, all_parts;
719 int buf_num;
720 u_char worked;
721
722 /* send to the disk changes that cannot be recreated by dbclean */
723 if (db_urgent_need_flush_secs != 0
724 && DB_IS_TIME(db_urgent_need_flush_secs,
725 DB_URGENT_NEED_FLUSH_SECS)) {
726 worked = 0;
727 for (b = buf_newest; b; b = b->older) {
728 if (b->buf_type == DB_BUF_TYPE_FREE)
729 continue;
730
731 for (part = 0;
732 b->flush_urgent != 0 && part < DB_BUF_NUM_PARTS;
733 ++part) {
734 if ((b->flush_urgent & PART2BIT(part))) {
735 buf_flush_part(0, b, part, 1);
736 worked = 1;
737 }
738 }
739
740 /* Switch new data pages to mmap()
741 * when this is not dbclean, since only dccd calls here
742 * they are not using mmap()
743 * they are either hash table pages or
744 * not the last page in the file */
745 if ((b->flags & DB_BUF_FG_USE_WRITE)
746 && !db_use_write
747 && (b->buf_type != DB_BUF_TYPE_DB
748 || (DB_PTR2PG_NUM(db_csize-1, db_pagesize)
749 != b->pg_num))) {
750 if (b->lock_cnt != 0)
751 rel_db_states();
752 buf_munmap(0, b);
753 }
754 }
755
756 /* Keep the clock running if we did any work. This tends to
757 * avoid stalls caused by colliding with the FreeBSD syncer */
758 if (worked) {
759 gettimeofday(&db_time, 0);
760 db_urgent_need_flush_secs = (db_time.tv_sec
761 + DB_URGENT_NEED_FLUSH_SECS);
762 } else {
763 db_urgent_need_flush_secs = 0;
764 }
765 }
766
767 /* assume there will be nothing more to do */
768 db_need_flush_secs = db_urgent_need_flush_secs;
769
770 #ifdef USE_MAP_NOSYNC
771 /* if we are using mmap(MAP_NOSYNC), then there are no bits
772 * set in any b->flush words except that of the recent
773 * DB_BUF_FG_USE_WRITE extensions of the file. It is best to let
774 * those blocks stay in RAM until the whole buffer is flushed and
775 * switched to mmap above */
776 if (!db_use_write)
777 return;
778 #endif
779
780 b = next_b;
781 part = next_part;
782 all_parts = DB_PARTS_PER_FLUSH;
783 for (buf_num = DIM(db_bufs); buf_num >= 0; --buf_num) {
784 if (b > LAST(db_bufs)) {
785 part = 0;
786 b = db_bufs;
787 }
788 if (!b->flush
789 || part >= DB_BUF_NUM_PARTS
790 || b->buf_type == DB_BUF_TYPE_FREE) {
791 part = 0;
792 ++b;
793 continue;
794 }
795
796 while (part < DB_BUF_NUM_PARTS) {
797 if (b->flush & PART2BIT(part)) {
798 buf_flush_part(0, b, part, 1);
799 if (--all_parts == 0) {
800 next_part = part+1;
801 next_b = b;
802 db_need_flush_secs = (db_time.tv_sec
803 + DB_NEED_FLUSH_SECS);
804 return;
805 }
806 if (!b->flush)
807 part = DB_BUF_NUM_PARTS;
808 }
809 ++part;
810 }
811 }
812 }
813
814
815
816 /* occassionally flush an unlocked data buffer for dbclean
817 * dbclean mostly changes only the current record, so get started
818 * writing the data to avoid stalling the system at the end. */
819 u_char
820 db_flush_db(DCC_EMSG emsg UATTRIB)
821 {
822 #ifdef USE_MAP_NOSYNC
823 DB_BUF *b;
824 int limit;
825 int pg_num;
826
827 /* Gently push the new hash table to disk.
828 * The disk image will never be accurate. This only allocates space.
829 * Do not do this for systems that lack mmap(NOSYNC) such as Linux
830 * that thrash themselves as the hash table is being built. A
831 * long pause when the database is closed is not as bad as spending
832 * hours building the hash table. */
833 while (hash_clear_pg_num < db_hash_fsize/db_hash_page_len) {
834 pg_num = hash_clear_pg_num++;
835 for (b = buf_oldest; b != 0; b = b->newer) {
836 if (b->pg_num != pg_num
837 || b->buf_type != DB_BUF_TYPE_HASH)
838 continue;
839 if (!(b->flags & DB_BUF_FG_EXTENSION))
840 break;
841 if (b->lock_cnt != 0)
842 rel_db_states();
843 return buf_munmap(emsg, b);
844 }
845
846 /* look for the next page if this one has already
847 * been flushed */
848 }
849
850 /* flush some ordinary buffers */
851 limit = 2;
852 for (b = buf_oldest; b != 0; b = b->newer) {
853 if (b->flush_urgent == 0
854 || b->buf_type == DB_BUF_TYPE_FREE
855 || b->lock_cnt != 0)
856 continue;
857 if (!buf_flush(emsg, b, 1))
858 return 0;
859 if (--limit <= 0)
860 return 1;
861 }
862 #endif
863 return 1;
864 }
865
866
867
868 /* mark part of a buffer dirty
869 * "Urgent" changes are flushed by a timer. Ordinary changes
870 * are often ignored and expected to be rebuilt if the system crashes.
871 * That the hash table is deleted as the system is shut down while the
872 * database must be flushed from the system's buffer cache is a reason
873 * to keep the disk image of the database good. */
874 void
875 db_set_flush(DB_STATE *st, u_char urgent, u_int len)
876 {
877 DB_BUF *b;
878 DB_BUF_FM bit, new_bits, old_bits;
879 char *buf_base, *part_end, *start, *end;
880 u_int part, i;
881
882 /* nothing to do if the kernel is handling it
883 * or if we are letting this change be reconstructed by dbclean */
884 b = st->b;
885 if (!(b->flags & DB_BUF_FG_USE_WRITE)) {
886 #ifdef USE_MAP_NOSYNC
887 if (!urgent)
888 #endif
889 return;
890 }
891
892 start = st->d.c;
893 buf_base = b->buf.c;
894
895 /* Increase to even pages in the hope that the file system might
896 * be able to page-flip. This might at least avoid reading into the
897 * buffer cache to honor a write(). Besides, Solaris' msync() handles
898 * only even pages. */
899 i = (start - buf_base) % system_pagesize;
900 start -= i;
901 len += i;
902 len = ((len + system_pagesize-1) / system_pagesize) * system_pagesize;
903
904 end = start + len;
905 if (end > buf_base+db_pagesize)
906 dcc_logbad(EX_SOFTWARE, "inflated dirty buffer size");
907
908 part = (start - buf_base) / db_pagesize_part;
909 part_end = buf_base + part * db_pagesize_part;
910 bit = PART2BIT(part);
911 new_bits = 0;
912 old_bits = b->flush | b->flush_urgent;
913 do {
914 part_end += db_pagesize_part;
915 if (part_end > end)
916 part_end = end;
917
918 if (!(old_bits & bit)) {
919 b->ranges[part].lo = start;
920 b->ranges[part].hi = part_end;
921 } else {
922 if (b->ranges[part].lo > start)
923 b->ranges[part].lo = start;
924 if (b->ranges[part].hi < part_end)
925 b->ranges[part].hi = part_end;
926 }
927 new_bits |= bit;
928
929 start = part_end;
930 bit <<= 1;
931 ++part;
932 } while (part_end < end);
933
934 if (urgent) {
935 b->flush_urgent |= new_bits;
936 if (!db_urgent_need_flush_secs) {
937 db_urgent_need_flush_secs = (db_time.tv_sec
938 + DB_URGENT_NEED_FLUSH_SECS);
939 if (db_need_flush_secs == 0)
940 db_need_flush_secs = db_urgent_need_flush_secs;
941 }
942 } else {
943 b->flush |= new_bits;
944 if (db_need_flush_secs == 0
945 || db_need_flush_secs > db_time.tv_sec+DB_NEED_FLUSH_SECS)
946 db_need_flush_secs = db_time.tv_sec+DB_NEED_FLUSH_SECS;
947 }
948 }
949
950
951
952 /* Shut down the database, including flushing and releasing all
953 * mmap()'ed buffers
954 * Do nothing to the files for mode=-1 because the file is new and garbage
955 * or the caller is a fork of the server shedding memory. */
956 u_char
957 db_close(int mode) /* -1=invalidate, 0=dirty, 1=clean */
958 {
959 u_char result;
960
961 if (mode >= 0) {
962 /* flush the data and then release and flush the dirty flags */
963 result = make_clean(mode == 0 ? 0 : 1);
964 if (!db_unload(0, 0))
965 result = 0;
966 } else {
967 db_invalidate = 1;
968 rel_db_states();
969 result = (db_unload(0, 0) > 0);
970 }
971
972 /* Close the hash table first because the server is often
973 * waiting for the lock on the main file held by dbclean.
974 * Destroy the hash table if it is bad */
975 if (db_hash_fd >= 0) {
976 if (0 > close(db_hash_fd)) {
977 dcc_pemsg(EX_IOERR, 0, "close(%s): %s",
978 db_hash_nm, ERROR_STR());
979 result = 0;
980 }
981 db_hash_fd = -1;
982 }
983 if (db_fd >= 0) {
984 if (0 > close(db_fd)) {
985 dcc_pemsg(EX_IOERR, 0, "close(%s): %s",
986 db_nm, ERROR_STR());
987 result = 0;
988 }
989 db_fd = -1;
990 }
991
992 db_locked.tv_sec = 0;
993 return result;
994 }
995
996
997
998 /* Delete the hash table if the system is being rebooted and we
999 * don't trust the file system to get all of the hash table. This might
1000 * make system shut down faster */
1001 void
1002 db_stop(void)
1003 {
1004 if (db_hash_fd < 0
1005 || !DB_IS_LOCKED()
1006 || !db_not_synced
1007 || db_hash_nm[0] == '\0')
1008 return;
1009
1010 if (0 > unlink(db_hash_nm)
1011 && errno != ENOENT)
1012 dcc_error_msg("unlink(%s): %s", db_hash_nm, ERROR_STR());
1013 }
1014
1015
1016
1017 /* see if (another) instance of dbclean is already running */
1018 static int dbclean_lock_fd = -1;
1019 static DCC_PATH dbclean_lock_nm;
1020
1021 u_char /* 1=no (other) dbclean */
1022 lock_dbclean(DCC_EMSG emsg, const char *cur_db_nm)
1023 {
1024 char pid[32];
1025 int i;
1026
1027 fnm2rel_good(dbclean_lock_nm, cur_db_nm, DB_LOCK_SUFFIX);
1028 dbclean_lock_fd = dcc_lock_open(emsg, dbclean_lock_nm,
1029 O_RDWR|O_CREAT,
1030 DCC_LOCK_OPEN_NOWAIT,
1031 DCC_LOCK_ALL_FILE, 0);
1032 if (dbclean_lock_fd < 0)
1033 return 0;
1034
1035 i = 1+snprintf(pid, sizeof(pid), "%ld\n", (long)getpid());
1036 if (i != write(dbclean_lock_fd, pid, i))
1037 dcc_logbad(EX_IOERR, "write(%s, pid): %s",
1038 dbclean_lock_nm, ERROR_STR());
1039
1040 /* Let anyone write in it in case we are running as root
1041 * and get interrupted by a crash or gdb. A stray, stale
1042 * private lock file cannot be locked */
1043 chmod(dbclean_lock_nm, 0666);
1044
1045 return 1;
1046 }
1047
1048
1049
1050 void
1051 unlock_dbclean(void)
1052 {
1053 if (dbclean_lock_fd >= 0) {
1054 if (0 > unlink(dbclean_lock_nm))
1055 dcc_error_msg("unlink(%s): %s",
1056 dbclean_lock_nm, ERROR_STR());
1057 close(dbclean_lock_fd);
1058 dbclean_lock_fd = -1;
1059 }
1060 }
1061
1062
1063
1064 /* This locking does only multiple-readers/single-writer */
1065 int /* -1=failed, 0=was not locked, 1=was */
1066 db_lock(void)
1067 {
1068 struct stat sb;
1069
1070 if (DB_IS_LOCKED())
1071 return 1;
1072
1073 if (!dcc_exlock_fd(0, db_fd, DCC_LOCK_ALL_FILE, 15*60, "", db_nm))
1074 return -1;
1075 if (0 > fstat(db_fd, &sb)) {
1076 db_failure(__LINE__,__FILE__, EX_IOERR, 0,
1077 "stat(%s): %s", db_nm, ERROR_STR());
1078 return -1;
1079 }
1080 if (db_fsize != (DB_HOFF)sb.st_size) {
1081 if (db_fsize > (DB_HOFF)sb.st_size || !db_rdonly) {
1082 db_failure(__LINE__,__FILE__, EX_IOERR, 0,
1083 "%s size changed from "OFF_HPAT
1084 " to "OFF_HPAT,
1085 db_nm, db_fsize, sb.st_size);
1086 return -1;
1087 }
1088 db_fsize = sb.st_size;
1089 }
1090
1091 db_locked = db_time;
1092 return 0;
1093 }
1094
1095
1096
1097 /* flush buffers to make the disk reasonably correct but not perfect
1098 * This does not compensate for a lack of coherent mmap() in the system.
1099 *
1100 * It leaves the disk only as accurate as implied by db_not_synced.
1101 * This flushes buffers marked either urgent and ordinarily dirty.
1102 * If db_not_synced is set, then non-urgent dirty bits are not set. */
1103 static u_char
1104 make_clean_flush(void)
1105 {
1106 DB_BUF *b;
1107 u_char result;
1108
1109 result = 1;
1110 for (b = buf_oldest; b != 0; b = b->newer) {
1111 if (b->buf_type == DB_BUF_TYPE_FREE)
1112 continue;
1113 if (!buf_flush(0, b, 0))
1114 result = 0;
1115 }
1116
1117 return result;
1118 }
1119
1120
1121
1122 /* push all of our database changes to the disk and try to clear the dirty bit
1123 * do not necessarily unmap anything */
1124 u_char
1125 make_clean(u_char clean) /* 0=leave hash marked dirty, */
1126 { /* 1=marked clean, 2=fsync */
1127 u_char need_db_fsync, result;
1128 struct stat sb;
1129
1130 rel_db_states();
1131
1132 result = 1;
1133
1134 /* quit if we are giving up */
1135 if (db_invalidate)
1136 return result;
1137
1138 if (db_failed_line)
1139 clean = 0;
1140
1141 if (!make_clean_flush()) {
1142 clean = 0;
1143 result = 0;
1144 }
1145
1146 /* simply unlock all of the buffers if they are clean
1147 * and do not need to (or cannot) be synchronized with fsync() */
1148 if (!db_dirty
1149 && (clean < 2 /* not asked to synchronize */
1150 || db_rdonly /* cannot be synchronized */
1151 || !db_not_synced)) /* does not need to be synchronized */
1152 return result;
1153
1154 need_db_fsync = (clean == 2);
1155
1156 /* Send the meta-data to disk so that other processes
1157 * such as dbclean can find the new length of the file
1158 * on Solaris. Otherwise the file looks broken because
1159 * its contained data length can be larger than its
1160 * inode size on Solaris. */
1161 if (!need_db_fsync && clean) {
1162 if (0 > fstat(db_fd, &sb)) {
1163 dcc_error_msg("make_clean fstat(%s): %s",
1164 db_nm, ERROR_STR());
1165 need_db_fsync = 1;
1166 } else if (db_fsize != (DB_HOFF)sb.st_size) {
1167 if (db_debug)
1168 quiet_trace_msg("need fsync() because db_fsize="
1169 OFF_HPAT" but stat="OFF_HPAT,
1170 db_fsize, sb.st_size);
1171 need_db_fsync = 1;
1172 }
1173 }
1174
1175 if (need_db_fsync
1176 && 0 > fsync(db_fd)) {
1177 dcc_error_msg("make_clean fsync(%s): %s",
1178 db_nm, ERROR_STR());
1179 clean = 0;
1180 result = 0;
1181 }
1182
1183 if (clean && !map_hash_ctl(0, 0)) {
1184 clean = 0;
1185 result = 0;
1186 }
1187 if (clean == 2) {
1188 if (0 > fsync(db_hash_fd)) {
1189 dcc_error_msg("make_clean fsync(%s): %s",
1190 db_hash_nm, ERROR_STR());
1191 clean = 0;
1192 result = 0;
1193 } else {
1194 db_not_synced = 0;
1195 db_sts.hash_ctl.d.vals->s.flags &= ~HASH_CTL_FG_NOSYNC;
1196 SET_FLUSH_HCTL(1);
1197 if (!make_clean_flush()) {
1198 clean = 0;
1199 result = 0;
1200 }
1201 }
1202 }
1203
1204 /* Clean the dirty flag in the hash table.
1205 * With luck, this will reach the disk after everything else. */
1206 if (clean
1207 && !(db_sts.hash_ctl.d.vals->s.flags & HASH_CTL_FG_CLEAN)) {
1208 db_sts.hash_ctl.d.vals->s.flags |= HASH_CTL_FG_CLEAN;
1209 SET_FLUSH_HCTL(0);
1210 }
1211
1212 /* finally flush the flag in the hash table */
1213 rel_db_states();
1214 if (!make_clean_flush())
1215 result = 0;
1216
1217 if (clean)
1218 db_dirty = 0;
1219 return result;
1220 }
1221
1222
1223
1224 /* mark the hash file and so the database dirty */
1225 static u_char
1226 db_make_dirty(DCC_EMSG emsg)
1227 {
1228 if (db_dirty)
1229 return 1;
1230
1231 if (!DB_IS_LOCKED()) {
1232 dcc_logbad(EX_SOFTWARE, "dirtying unlocked database");
1233 return 0;
1234 }
1235
1236 if (db_rdonly)
1237 dcc_logbad(EX_SOFTWARE, "dirtying read-only database");
1238
1239 if (!map_hash_ctl(emsg, 0))
1240 return 0;
1241 db_sts.hash_ctl.d.vals->s.flags &= ~HASH_CTL_FG_CLEAN;
1242 #ifdef USE_MAP_NOSYNC
1243 if (!(db_sts.hash_ctl.d.vals->s.flags & HASH_CTL_FG_NOSYNC)) {
1244 db_sts.hash_ctl.d.vals->s.synced = time(0);
1245 db_sts.hash_ctl.d.vals->s.flags |= HASH_CTL_FG_NOSYNC;
1246 }
1247 db_not_synced = 1;
1248 #endif
1249
1250 SET_FLUSH_HCTL(1);
1251 if (!buf_flush_part(emsg, db_sts.hash_ctl.b, 0, 0))
1252 return 0;
1253
1254 db_dirty = 1;
1255 return 1;
1256 }
1257
1258
1259
1260 /* (start to) unlock the database */
1261 u_char /* 0=failed, 1=at least started */
1262 db_unlock(void)
1263 {
1264 DB_BUF *b;
1265 int result;
1266
1267 if (!DB_IS_LOCKED())
1268 return 1;
1269
1270 /* Clear the dirty bit in the database because we may not
1271 * be able to lock the database later to clear the dirty bit.
1272 * Dbclean needs to see the dirty bit clear. */
1273 result = make_clean(1);
1274
1275 /* Release DB_BUF_FG_USE_WRITE buffers because they are not consistent
1276 * among processes
1277 * Release everything if dccd wants stay out of RAM in favor
1278 * of dbclean */
1279 for (b = buf_oldest; b != 0; b = b->newer) {
1280 if (b->buf_type == DB_BUF_TYPE_FREE)
1281 continue;
1282 if (db_minimum_map
1283 || (b->flags & DB_BUF_FG_USE_WRITE))
1284 buf_munmap(0, b);
1285 }
1286
1287 if (!dcc_unlock_fd(0, db_fd, DCC_LOCK_ALL_FILE, "", db_nm))
1288 result = 0;
1289 db_locked.tv_sec = 0;
1290 return result;
1291 }
1292
1293
1294
1295 static const char *
1296 mbyte2str(DB_PTR val)
1297 {
1298 return db_ptr2str(val*1024*1024);
1299 }
1300
1301
1302
1303 #if defined(RLIMIT_AS) || defined(RLIMIT_RSS) || defined(RLIMIT_FSIZE)
1304 static DB_PTR
1305 use_rlimit(int resource, const char *rlimit_nm,
1306 DB_PTR cur_val, DB_PTR min_val, const char *val_nm)
1307 {
1308 struct rlimit limit_old, limit_new;
1309 DB_PTR new_val;
1310
1311 if (0 > getrlimit(resource, &limit_old)) {
1312 dcc_error_msg("getrlimit(%s): %s", rlimit_nm, ERROR_STR());
1313 return cur_val;
1314 }
1315
1316 if ((DB_PTR)limit_old.rlim_cur >= cur_val+DB_PAD_MBYTE*1024)
1317 return cur_val;
1318
1319 /* assume we are root and try to increase the hard limit */
1320 if ((DB_PTR)limit_new.rlim_max < cur_val+DB_PAD_BYTE) {
1321 limit_new = limit_old;
1322 limit_new.rlim_max = cur_val+DB_PAD_BYTE;
1323 if (0 > setrlimit(resource, &limit_new)) {
1324 if (db_debug)
1325 quiet_trace_msg("setrlimit(%s, "
1326 L_DPAT","L_DPAT"): %s",
1327 rlimit_nm,
1328 (DB_PTR)limit_new.rlim_cur,
1329 (DB_PTR)limit_new.rlim_max,
1330 ERROR_STR());
1331 } else {
1332 if (0 > getrlimit(resource, &limit_old)) {
1333 dcc_error_msg("getrlimit(%s): %s",
1334 rlimit_nm, ERROR_STR());
1335 return cur_val;
1336 }
1337 }
1338 }
1339
1340 limit_new = limit_old;
1341 if ((DB_PTR)limit_new.rlim_max < min_val+DB_PAD_BYTE)
1342 limit_new.rlim_max = min_val + DB_PAD_BYTE;
1343 limit_new.rlim_cur = limit_new.rlim_max;
1344 if ((DB_PTR)limit_new.rlim_cur > cur_val+DB_PAD_BYTE)
1345 limit_new.rlim_cur = cur_val+DB_PAD_BYTE;
1346 if (0 > setrlimit(resource, &limit_new)) {
1347 dcc_error_msg("setrlimit(%s, "L_DPAT","L_DPAT"): %s",
1348 rlimit_nm,
1349 (DB_PTR)limit_new.rlim_cur,
1350 (DB_PTR)limit_new.rlim_max,
1351 ERROR_STR());
1352 new_val = limit_old.rlim_cur - DB_PAD_BYTE;
1353 if (new_val < min_val)
1354 new_val = min_val;
1355 } else {
1356 if (limit_old.rlim_cur < limit_new.rlim_cur
1357 && db_debug)
1358 quiet_trace_msg("increased %s from %s to %s",
1359 rlimit_nm,
1360 db_ptr2str(limit_old.rlim_cur),
1361 #ifdef RLIM_INFINITY
1362 (limit_new.rlim_cur == RLIM_INFINITY)
1363 ? "infinity" :
1364 #endif
1365 db_ptr2str(limit_new.rlim_cur));
1366 new_val = limit_new.rlim_cur - DB_PAD_BYTE;
1367 }
1368
1369 if (cur_val > new_val) {
1370 quiet_trace_msg("%s reduced %s from %s to %s",
1371 rlimit_nm, val_nm,
1372 db_ptr2str(cur_val),
1373 db_ptr2str(new_val));
1374 return new_val;
1375 }
1376
1377 return cur_val;
1378 }
1379 #endif
1380
1381
1382
1383 static void
1384 get_db_max_rss(void)
1385 {
1386 DB_PTR old_val, new_val, db_min_mbyte, db_min_byte, db_max_mbyte;
1387 int physmem_str_len;
1388 DB_PTR physmem;
1389
1390 /* use default maximum if maximum is bogus or unset by ./configure */
1391 db_max_mbyte = MAX_MAX_DB_MBYTE;
1392 #if DB_MAX_MBYTE != 0
1393 db_max_mbyte = DB_MAX_MBYTE;
1394 if (db_max_mbyte < DB_MIN_MIN_MBYTE
1395 || db_max_mbyte > MAX_MAX_DB_MBYTE) {
1396 quiet_trace_msg("ignore bad ./configure --with-max-db-mem=%d",
1397 DB_MAX_MBYTE);
1398 db_max_mbyte = MAX_MAX_DB_MBYTE;
1399 } else if (db_debug) {
1400 quiet_trace_msg("DB max=%s"
1401 " from ./configure --with-max-db-mem=%d",
1402 mbyte2str(db_max_mbyte), DB_MAX_MBYTE);
1403 }
1404 #endif
1405 #ifndef HAVE_BIG_FILES
1406 /* we need big off_t for files larger than 2 GBytes */
1407 if (db_max_mbyte > DB_MAX_2G_MBYTE) {
1408 old_val = db_max_mbyte;
1409 db_max_mbyte= DB_MAX_2G_MBYTE;
1410 if (db_debug)
1411 quiet_trace_msg("32-bit off_t reduced DB max from %s"
1412 " to %s",
1413 mbyte2str(old_val),
1414 mbyte2str(db_max_mbyte));
1415 }
1416 #endif
1417
1418 /* use default if ./configure --with-db-memory=MB is bogus or unset */
1419 #if DB_MIN_MBYTE == 0
1420 db_min_mbyte = 64;
1421 #else
1422 db_min_mbyte = DB_MIN_MBYTE;
1423 if (db_min_mbyte < DB_MIN_MIN_MBYTE) {
1424 quiet_trace_msg("ignore bad ./configure --with-db-memory=%d",
1425 DB_MIN_MBYTE);
1426 db_min_mbyte = DB_DEF_MIN_MBYTE;
1427 } else if (db_min_mbyte > db_max_mbyte) {
1428 quiet_trace_msg("ignore ./configure --with-db-memory=%d"
1429 " > DB max=%s",
1430 mbyte2str(db_max_mbyte));
1431 db_min_mbyte = DB_DEF_MIN_MBYTE;
1432 } else if (db_debug) {
1433 quiet_trace_msg("use ./configure --with-db-memory=%d",
1434 DB_MIN_MBYTE);
1435 }
1436 #endif
1437
1438 db_min_byte = db_min_mbyte * (1024*1024);
1439 db_max_byte = db_max_mbyte * (1024*1024);
1440
1441 #ifdef RLIMIT_FSIZE
1442 db_max_mbyte = (use_rlimit(RLIMIT_FSIZE, "RLIMIT_FSIZE",
1443 db_max_byte, db_min_byte, "DB max")
1444 / (1024*1024));
1445 db_max_byte = db_max_mbyte * (1024*1024);
1446 #endif /* RLIMIT_FSIZE */
1447
1448 physmem = 0;
1449 #ifdef HAVE_PHYSMEM_TOTAL
1450 /* maybe someday physmem_total() will be widely available */
1451 physmem = physmem_total();
1452 if (db_debug)
1453 quiet_trace_msg("real=%s from physmem_total()",
1454 db_ptr2str(physmem));
1455 #endif
1456 #ifdef HAVE__SC_PHYS_PAGES
1457 if (physmem == 0) {
1458 long pages, sizepage;
1459
1460 if ((pages = sysconf(_SC_PHYS_PAGES)) == -1) {
1461 dcc_error_msg("sysconf(_SC_PHYS_PAGES): %s",
1462 ERROR_STR());
1463 } else if ((sizepage = sysconf(_SC_PAGESIZE)) == -1) {
1464 dcc_error_msg("sysconf(_SC_PAGESIZE): %s",
1465 ERROR_STR());
1466 } else {
1467 physmem = (DB_PTR)pages * (DB_PTR)sizepage;
1468 if (db_debug)
1469 quiet_trace_msg("real=%s"
1470 " from sysconf(_SC_PHYS_PAGES)"
1471 " and sysconf(_SC_PAGESIZE)",
1472 db_ptr2str(physmem));
1473 }
1474 }
1475 #endif
1476 #ifdef HAVE_HW_PHYSMEM
1477 if (physmem == 0) {
1478 int mib[2] = {CTL_HW, HW_PHYSMEM};
1479 unsigned long int hw_physmem;
1480 size_t hw_physmem_len;
1481
1482 hw_physmem_len = sizeof(hw_physmem);
1483 if (0 > sysctl(mib, 2, &hw_physmem, &hw_physmem_len, 0,0)) {
1484 dcc_error_msg("sysctl(HW_PHYSMEM): %s", ERROR_STR());
1485 } else {
1486 physmem = hw_physmem;
1487 if (db_debug)
1488 quiet_trace_msg("real=%s from sysctl(mib)",
1489 db_ptr2str(physmem));
1490 }
1491 }
1492 #endif
1493 #ifdef HAVE_PSTAT_GETSTATIC
1494 if (physmem == 0) {
1495 struct pst_static pss;
1496
1497 if (0 > pstat_getstatic(&pss, sizeof pss, 1, 0)) {
1498 dcc_error_msg("pstat_getstatic(): %s", ERROR_STR());
1499 } else if (pss.physical_memory <= 0
1500 || pss.page_size < 0) {
1501 dcc_error_msg("pstat_getstatic() says"
1502 " physical_memory=%d page_size=%d",
1503 pss.physical_memory, pss.page_size);
1504 } else {
1505 physmem = ((DB_PTR)pss.physical_memory
1506 * (DB_PTR)pss.page_size);
1507 if (db_debug)
1508 quiet_trace_msg("real=%s"
1509 " from pstat_getstatic()",
1510 db_ptr2str(physmem));
1511 }
1512 }
1513 #endif
1514
1515 physmem_str_len = 0;
1516 db_physmem_str[0] = '\0';
1517 if (physmem == 0) {
1518 quiet_trace_msg("failed to get real memory size");
1519 } else {
1520 physmem_str_len = snprintf(db_physmem_str,
1521 sizeof(db_physmem_str),
1522 " real=%s",
1523 db_ptr2str(physmem));
1524
1525 /* Try to use half of physical memory
1526 * if there is less than 2 GByte
1527 * all except 512 MByte between 2 GByte and 4 GByte,
1528 * and all but 1 GByte if there is more than 4 GByte */
1529 if (physmem/(1024*1024) < 2*1024)
1530 new_val = physmem/2;
1531 else if (physmem/(1024*1024) <= 4*1024)
1532 new_val = physmem - 512*(1024*1024);
1533 else
1534 new_val = physmem - 1024*(1024*1024);
1535 if (new_val < db_min_byte) {
1536 if (db_debug)
1537 quiet_trace_msg("real=%s would give DB max=%s"
1538 " smaller than minimum %s",
1539 db_ptr2str(physmem),
1540 db_ptr2str(new_val),
1541 mbyte2str(db_min_mbyte));
1542 new_val = db_min_byte;
1543 }
1544 if (db_max_byte > new_val) {
1545 old_val = db_max_byte;
1546 db_max_mbyte = new_val / (1024*1024);
1547 db_max_byte = db_max_mbyte * (1024*1024);
1548 if (db_debug)
1549 quiet_trace_msg("real=%s reduced DB max"
1550 " from %s to %s",
1551 db_ptr2str(physmem),
1552 db_ptr2str(old_val),
1553 db_ptr2str(db_max_byte));
1554 }
1555 }
1556
1557 /* window need not be larger than the limit on the database size */
1558 db_max_rss = db_max_byte;
1559
1560 #ifdef RLIMIT_AS
1561 /* try not to break process virtual memory limit,
1562 * but only if it is not ridiculously tiny */
1563 db_max_rss = use_rlimit(RLIMIT_AS, "RLIMIT_AS",
1564 db_max_rss, db_min_byte, "max RSS");
1565 #endif /* RLIMIT_AS */
1566 #ifdef RLIMIT_RSS
1567 /* try not to break process resident memory limit
1568 * but only if it is not ridiculously tiny */
1569 db_max_rss = use_rlimit(RLIMIT_RSS, "RLIMIT_RSS",
1570 db_max_rss, db_min_byte, "max RSS");
1571 #endif /* RLIMIT_RSS */
1572
1573 /* limit the database to the window size */
1574 if (db_max_byte > db_max_rss) {
1575 old_val = db_max_mbyte;
1576 db_max_mbyte = db_max_rss / (1024*1024);
1577 db_max_byte = db_max_mbyte * (1024*1024);
1578 if (db_debug)
1579 quiet_trace_msg("max RSS reduced DB max from %s to %s",
1580 mbyte2str(old_val),
1581 mbyte2str(db_max_mbyte));
1582 }
1583
1584 #ifndef HAVE_64BIT_PTR
1585 /* We cannot use a window larger than 2 GBytes on most systems without
1586 * big pointers. Among the things that break is trying to mmap() more
1587 * than 2 GBytes. So limit the window on 32-bit systems to a little
1588 * less than 2 GBytes and the database to not much more */
1589 if (db_max_rss > DB_MAX_2G_MBYTE*(1024*1024)) {
1590 if (db_debug)
1591 quiet_trace_msg("32-bit pointers reduced max RSS"
1592 " from %s to %s",
1593 db_ptr2str(db_max_rss),
1594 mbyte2str(DB_MAX_2G_MBYTE));
1595 db_max_rss = DB_MAX_2G_MBYTE*(1024*1024);
1596 new_val = db_max_rss+db_max_rss/4;
1597 if (db_max_byte > new_val) {
1598 old_val = db_max_mbyte;
1599 db_max_mbyte = new_val / (1024*1024);
1600 db_max_byte = db_max_mbyte * (1024*1024);
1601 if (db_debug)
1602 quiet_trace_msg("32-bit pointers reduced DB max"
1603 " from %s to %s",
1604 mbyte2str(old_val),
1605 mbyte2str(db_max_mbyte));
1606 }
1607 }
1608 #endif
1609
1610 snprintf(&db_physmem_str[physmem_str_len],
1611 sizeof(db_physmem_str) - physmem_str_len,
1612 " max RSS=%s DB max=%s",
1613 db_ptr2str(db_max_rss), mbyte2str(db_max_mbyte));
1614 }
1615
1616
1617
1618 /* Pick a buffer size that will hold an integral number of DB hash
1619 * table entries and is a multiple of system's page size.
1620 * The entire hash table should reside in memory
1621 * if the system has enough memory. */
1622 u_int
1623 db_get_pagesize(u_int old_pagesize, /* 0 or required page size */
1624 u_int tgt_pagesize) /* 0 or target page size */
1625 {
1626 u_int min_pagesize, max_pagesize;
1627
1628 /* Ask the operating system only once so we don't get differing
1629 * answers and so compute a varying page size.
1630 * Some systems can't keep their stories straight. */
1631 if (db_max_rss == 0)
1632 get_db_max_rss();
1633
1634 /* Compute the least common multiple of the system page and
1635 * the DB hash table entry size.
1636 * This will give us the smallest page size that we can use. */
1637 system_pagesize = getpagesize();
1638 min_pagesize = lcm(system_pagesize, sizeof(HASH_ENTRY));
1639
1640 /* The kludge to speed conversion of database addresses to page numbers
1641 * and offsets on 32-bit systems depends on the page size being
1642 * a multiple of 256 */
1643 if ((min_pagesize % (1<<DB_PTR_SHIFT)) != 0)
1644 dcc_logbad(EX_SOFTWARE, "page size not a multiple of 256");
1645
1646 /* The DB buffer or page size must also be a multiple of the
1647 * the end-of-page padding used in the main database file. */
1648 if (sizeof(DB_RCD) % DB_RCD_HDR_LEN != 0)
1649 dcc_logbad(EX_SOFTWARE,
1650 "DB padding size %d"
1651 " is not a divisor of DB entry size %d",
1652 DB_RCD_HDR_LEN, ISZ(DB_RCD));
1653 if (DB_RCD_LEN_MAX % DB_RCD_HDR_LEN != 0)
1654 dcc_logbad(EX_SOFTWARE,
1655 "DB record not a multiple of header size");
1656 min_pagesize = lcm(min_pagesize, DB_RCD_HDR_LEN);
1657
1658 /* Use the old buffer size if available so we are not confused
1659 * by padding at the ends of the old pages.
1660 * Fail if it is impossible. This should cause dbclean to
1661 * rebuild the database. */
1662 if (old_pagesize != 0) {
1663 if ((old_pagesize % min_pagesize) != 0)
1664 return 0;
1665 /* adjust the number of buffers to fit our window size */
1666 db_buf_total = db_max_rss / old_pagesize;
1667 if (db_buf_total < (int)DB_BUF_MIN)
1668 return 0;
1669 if (db_buf_total > DB_BUF_MAX)
1670 db_buf_total = DB_BUF_MAX;
1671 return old_pagesize;
1672 }
1673
1674 db_buf_total = DB_BUF_MAX;
1675 max_pagesize = db_max_rss / db_buf_total;
1676 max_pagesize -= max_pagesize % min_pagesize;
1677
1678 /* If we have a target page size, try to use it instead of the
1679 * maximum page size allowed by the resident set size.
1680 * Normal DCC databases grow large and want pages as large as possible
1681 * but greylist databases are often small.
1682 * We also want a tiny page when first reading the parameters while
1683 * opening. */
1684 if (tgt_pagesize != 0 && tgt_pagesize < max_pagesize) {
1685 tgt_pagesize -= tgt_pagesize % min_pagesize;
1686 if (tgt_pagesize < min_pagesize)
1687 tgt_pagesize = min_pagesize;
1688 return tgt_pagesize;
1689 } else if (max_pagesize > min_pagesize) {
1690 return max_pagesize;
1691 } else {
1692 return min_pagesize;
1693 }
1694 }
1695
1696
1697
1698 /* (re)create the buffer pool
1699 * The buffers are small blocks that point to the real mmap()'ed memory.
1700 */
1701 u_char
1702 db_buf_init(u_int old_pagesize, /* 0 or required page size */
1703 u_int tgt_pagesize) /* 0 or target page size */
1704 {
1705 DB_BUF *b, *bprev, *bnext;
1706 int i;
1707
1708
1709 db_pagesize = db_get_pagesize(old_pagesize, tgt_pagesize);
1710 if (db_pagesize == 0)
1711 return 0;
1712
1713 /* The fragments of pages must be multiples of system pages
1714 * so that msync() on Solaris can be given multiples of system
1715 * pages. It's also a generally good idea. */
1716 db_pagesize_part = db_pagesize/DB_BUF_NUM_PARTS;
1717 db_pagesize_part = ((db_pagesize_part + system_pagesize-1)
1718 / system_pagesize) * system_pagesize;
1719
1720 db_page_max = db_pagesize - DB_RCD_HDR_LEN;
1721 db_hash_page_len = db_pagesize/sizeof(HASH_ENTRY);
1722
1723 db_max_hash_entries = (MAX_HASH_ENTRIES
1724 - MAX_HASH_ENTRIES % db_hash_page_len);
1725
1726 memset(db_bufs, 0, sizeof(db_bufs));
1727 b = db_bufs;
1728 buf_oldest = b;
1729 bprev = 0;
1730 for (i = db_buf_total; --i != 0; b = bnext) {
1731 bnext = b+1;
1732 b->older = bprev;
1733 b->newer = bnext;
1734 bprev = b;
1735 }
1736 b->older = bprev;
1737 buf_newest = b;
1738
1739 memset(db_buf_hash, 0, sizeof(db_buf_hash));
1740
1741 return 1;
1742 }
1743
1744
1745
1746 static u_char
1747 make_new_hash(DCC_EMSG emsg, DB_HADDR new_hash_len)
1748 {
1749 struct stat sb;
1750 HASH_ENTRY *hash;
1751 DB_HADDR next_haddr, cur_haddr, prev_haddr;
1752 u_int pagenum;
1753
1754 if (getuid() == 0) {
1755 /* if we are running as root,
1756 * don't change the owner of the database */
1757 if (0 > fstat(db_fd, &sb)) {
1758 dcc_pemsg(EX_IOERR, emsg, "fstat(%s): %s",
1759 db_nm, ERROR_STR());
1760 return 0;
1761 }
1762 if (0 > fchown(db_hash_fd, sb.st_uid, sb.st_gid)) {
1763 dcc_pemsg(EX_IOERR, emsg, "fchown(%s,%d,%d): %s",
1764 db_hash_nm, (int)sb.st_uid, (int)sb.st_gid,
1765 ERROR_STR());
1766 return 0;
1767 }
1768 }
1769
1770 if (new_hash_len < MIN_HASH_ENTRIES)
1771 new_hash_len = MIN_HASH_ENTRIES;
1772
1773 /* Increase the requested hash table size to a multiple of the database
1774 * page size. The page size is chosen to be a multiple of the size of
1775 * a single hash table entry. */
1776 db_hash_fsize = (((DB_HOFF)new_hash_len)*sizeof(HASH_ENTRY)
1777 + db_pagesize-1);
1778 db_hash_fsize -= db_hash_fsize % db_pagesize;
1779 new_hash_len = db_hash_fsize / sizeof(HASH_ENTRY);
1780
1781 if (new_hash_len > db_max_hash_entries)
1782 new_hash_len = db_max_hash_entries;
1783
1784 /* create the empty hash table file */
1785 rel_db_states();
1786 if (!db_unload(emsg, 0))
1787 return 0;
1788 if (0 > ftruncate(db_hash_fd, 0)) {
1789 dcc_pemsg(EX_IOERR, emsg, "truncate(%s,"L_HPAT"): %s",
1790 db_hash_nm, db_csize, ERROR_STR());
1791 return 0;
1792 }
1793
1794 db_hash_len = new_hash_len;
1795 db_hash_used_stored_hash = db_hash_used = DB_HADDR_BASE;
1796 db_hash_divisor = get_db_hash_divisor(db_hash_len);
1797
1798 /* Clear new hash file by linking its entries into the free list */
1799 /* map and clear the first page */
1800 if (!map_hash_ctl(emsg, 1))
1801 return 0;
1802
1803 /* create the header */
1804 strcpy(db_sts.hash_ctl.d.vals->s.magic, HASH_MAGIC_STR);
1805 db_sts.hash_ctl.d.vals->s.free_fwd = DB_HADDR_BASE;
1806 db_sts.hash_ctl.d.vals->s.free_bak = db_hash_len-1;
1807 db_sts.hash_ctl.d.vals->s.len = db_hash_len;
1808 db_sts.hash_ctl.d.vals->s.divisor = db_hash_divisor;
1809 db_sts.hash_ctl.d.vals->s.used = DB_HADDR_BASE;
1810 db_sts.hash_ctl.d.vals->s.synced = time(0);
1811 db_dirty = 1;
1812 #ifdef USE_MAP_NOSYNC
1813 db_sts.hash_ctl.d.vals->s.synced = time(0);
1814 db_sts.hash_ctl.d.vals->s.flags |= HASH_CTL_FG_NOSYNC;
1815 db_not_synced = 1;
1816 #endif
1817
1818 /* Link the hash table entries in the first and following pages.
1819 * The page size is chosen to be a multiple of the size of a
1820 * single hash table entry. */
1821 prev_haddr = FREE_HADDR_END;
1822 cur_haddr = DB_HADDR_BASE;
1823 next_haddr = cur_haddr+1;
1824 hash = &db_sts.hash_ctl.d.vals->h[DB_HADDR_BASE];
1825 pagenum = 0;
1826 for (;;) {
1827 do {
1828 DB_HADDR_CP(hash->bak, prev_haddr);
1829 if (next_haddr == db_hash_len)
1830 DB_HADDR_CP(hash->fwd, FREE_HADDR_END);
1831 else
1832 DB_HADDR_CP(hash->fwd, next_haddr);
1833 ++hash;
1834 prev_haddr = cur_haddr;
1835 cur_haddr = next_haddr++;
1836 } while (cur_haddr % db_hash_page_len != 0);
1837
1838 if (++pagenum >= db_hash_fsize/db_pagesize)
1839 break;
1840
1841 if (!map_hash(emsg, cur_haddr, &db_sts.free, 1))
1842 return 0;
1843 db_sts.free.b->flush_urgent = (DB_BUF_FM)-1;
1844 hash = db_sts.free.d.h;
1845 }
1846
1847 hash_clear_pg_num = 0;
1848
1849 return 1;
1850 }
1851
1852
1853
1854 static u_char
1855 check_old_hash(DCC_EMSG emsg)
1856 {
1857 static const u_char magic[sizeof(((HASH_CTL*)0)->s.magic)
1858 ] = HASH_MAGIC_STR;
1859 const HASH_CTL *vals;
1860 struct stat sb;
1861 u_char old_db;
1862
1863 /* check the size of the existing hash file */
1864 if (0 > fstat(db_hash_fd, &sb)) {
1865 dcc_pemsg(EX_IOERR, emsg, "stat(%s): %s",
1866 db_hash_nm, ERROR_STR());
1867 return 0;
1868 }
1869 db_hash_fsize = sb.st_size;
1870 if ((db_hash_fsize % sizeof(HASH_ENTRY)) != 0) {
1871 dcc_pemsg(EX_DATAERR, emsg, "%s has size "OFF_DPAT","
1872 " not a multiple of %d",
1873 db_hash_nm, db_hash_fsize,
1874 ISZ(HASH_ENTRY));
1875 return 0;
1876 }
1877
1878 db_hash_len = db_hash_fsize/sizeof(HASH_ENTRY);
1879 if (db_hash_len < MIN_HASH_ENTRIES) {
1880 dcc_pemsg(EX_DATAERR, emsg,
1881 "%s has too few records, "OFF_DPAT" bytes",
1882 db_hash_nm, db_hash_fsize);
1883 return 0;
1884 }
1885
1886 /* check the magic number */
1887 if (!map_hash_ctl(emsg, 0))
1888 return 0;
1889 vals = db_sts.hash_ctl.d.vals;
1890 if (memcmp(vals->s.magic, &magic, sizeof(magic))) {
1891 dcc_pemsg(EX_DATAERR, emsg,
1892 "%s has the wrong magic \"%.*s\"",
1893 db_hash_nm, ISZ(HASH_ENTRY), vals->s.magic);
1894 return 0;
1895 }
1896
1897 if (!(vals->s.flags & HASH_CTL_FG_CLEAN)) {
1898 dcc_pemsg(EX_DATAERR, emsg, "%s was not closed cleanly",
1899 db_hash_nm);
1900 return 0;
1901 }
1902 if (vals->s.flags & HASH_CTL_FG_NOSYNC) {
1903 #ifdef HAVE_BOOTTIME
1904 int mib[2] = {CTL_KERN, KERN_BOOTTIME};
1905 size_t boottime_len;
1906 #endif
1907 struct timeval boottime;
1908
1909 boottime.tv_sec = 0x7fffffff;
1910 #ifdef HAVE_BOOTTIME
1911 boottime_len = sizeof(boottime);
1912 if (0 > sysctl(mib, 2, &boottime, &boottime_len, 0, 0)) {
1913 dcc_error_msg("sysctl(KERN_BOOTTIME): %s", ERROR_STR());
1914 }
1915 #endif
1916 if (vals->s.synced <= boottime.tv_sec) {
1917 dcc_pemsg(EX_DATAERR, emsg, "%s was not synchronized;"
1918 " synced=%d boottime=%d",
1919 db_hash_nm,
1920 (int)vals->s.synced, (int)boottime.tv_sec);
1921 return 0;
1922 }
1923 db_not_synced = 1;
1924 }
1925
1926 if (DB_HADDR_INVALID(vals->s.free_fwd)
1927 && (vals->s.free_fwd != FREE_HADDR_END
1928 || vals->s.free_fwd != vals->s.free_bak)) {
1929 dcc_pemsg(EX_DATAERR, emsg,
1930 "%s has a broken free list head of %#x",
1931 db_hash_nm, vals->s.free_fwd);
1932 return 0;
1933 }
1934 if (DB_HADDR_INVALID(vals->s.free_bak)
1935 && (vals->s.free_bak != FREE_HADDR_END
1936 || vals->s.free_fwd != vals->s.free_bak)) {
1937 dcc_pemsg(EX_DATAERR, emsg,
1938 "%s has a broken free list tail of %#x",
1939 db_hash_nm, vals->s.free_bak);
1940 return 0;
1941 }
1942
1943 if (db_hash_len != vals->s.len) {
1944 dcc_pemsg(EX_DATAERR, emsg,
1945 "%s has %d entries but claims %d",
1946 db_hash_nm, db_hash_len,
1947 vals->s.len);
1948 return 0;
1949 }
1950
1951 db_hash_divisor = vals->s.divisor;
1952 if (db_hash_divisor < MIN_HASH_DIVISOR
1953 || db_hash_divisor >= db_hash_len) {
1954 dcc_pemsg(EX_DATAERR, emsg, "%s has hash divisor %d",
1955 db_hash_nm, db_hash_len);
1956 return 0;
1957 }
1958
1959 db_hash_used_stored_hash = db_hash_used = vals->s.used;
1960 if (db_hash_used < DB_HADDR_BASE) {
1961 dcc_pemsg(EX_DATAERR, emsg,
1962 "%s contains impossible %u entries",
1963 db_hash_nm, HADDR2LEN(db_hash_used));
1964 return 0;
1965 }
1966 if (db_hash_used >= db_hash_len) {
1967 if (db_hash_used > db_hash_len)
1968 dcc_pemsg(EX_DATAERR, emsg,
1969 "%s contains only %u entries but %u used",
1970 db_hash_nm,
1971 HADDR2LEN(db_hash_len),
1972 HADDR2LEN(db_hash_used));
1973 else
1974 dcc_pemsg(EX_DATAERR, emsg,
1975 "%s is filled with %u entries",
1976 db_hash_nm,
1977 HADDR2LEN(db_hash_len));
1978 return 0;
1979 }
1980
1981 /* old databases lack the growth values */
1982 old_db = 0;
1983 if (!db_rdonly
1984 && db_parms.old_db_csize == 0
1985 && db_parms.db_added == 0
1986 && db_parms.hash_used == 0
1987 && db_parms.old_hash_used == 0
1988 && db_parms.hash_added == 0
1989 && db_parms.rate_secs == 0
1990 && db_parms.last_rate_sec == 0) {
1991 quiet_trace_msg("repair database growth measurements");
1992 db_parms.old_db_csize = db_parms.db_csize;
1993 old_db = 1;
1994 }
1995
1996 if (db_hash_used != db_parms.hash_used
1997 && db_hash_fsize != 0) {
1998 if (old_db) {
1999 quiet_trace_msg("repair db_parms.old hash_used"
2000 " and old_hash_used");
2001 db_parms.old_hash_used = db_hash_used;
2002 db_parms.hash_used = db_hash_used;
2003 } else {
2004 dcc_pemsg(EX_DATAERR, emsg,
2005 "%s contains %d"
2006 " entries instead of the %d that %s claims",
2007 db_hash_nm, db_hash_used,
2008 db_parms.hash_used, db_nm);
2009 return 0;
2010 }
2011 }
2012
2013 db_csize_stored_hash = vals->s.db_csize;
2014 if (db_csize_stored_hash != db_csize
2015 && db_hash_fsize != 0) {
2016 dcc_pemsg(EX_DATAERR, emsg,
2017 "%s contains "L_DPAT
2018 " bytes instead of the "L_DPAT" that %s claims",
2019 db_nm, db_csize,
2020 db_csize_stored_hash, db_hash_nm);
2021 return 0;
2022 }
2023
2024 return 1;
2025 }
2026
2027
2028
2029 /* open the files and generally get ready to work */
2030 u_char /* 0=failed, 1=ok */
2031 db_open(DCC_EMSG emsg,
2032 int new_db_fd, /* -1 or already open db_fd */
2033 const char *new_db_nm,
2034 DB_HADDR new_hash_len, /* 0 or # of entries */
2035 DB_OPEN_MODES mode) /* DB_OPEN_* */
2036 {
2037 u_int cur_pagesize;
2038 int hash_flags, db_open_flags;
2039 struct stat db_sb;
2040 # define OPEN_BAIL() {if (new_db_fd >= 0) db_fd = -1; \
2041 db_close(-1); return 0;}
2042
2043 db_close(1);
2044 db_failed_line = __LINE__;
2045 db_failed_file = __FILE__;
2046 db_not_synced = 0;
2047 db_minimum_map = 0;
2048 db_invalidate = 0;
2049 db_dirty = 0;
2050 db_locked.tv_sec = 0;
2051
2052 db_rdonly = (mode & DB_OPEN_RDONLY) != 0;
2053 db_use_write = (mode & DB_OPEN_MMAP_WRITE) != 0;
2054
2055 memset(&db_stats, 0, sizeof(db_stats));
2056
2057 if (!new_db_nm && db_nm[0] == '\0')
2058 new_db_nm = grey_on ? DB_GREY_NAME : DB_DCC_NAME;
2059 if (new_db_nm) {
2060 if (!fnm2rel(db_nm, new_db_nm, 0)
2061 || !fnm2rel(db_hash_nm, db_nm, DB_HASH_SUFFIX)) {
2062 dcc_pemsg(EX_DATAERR, emsg,
2063 "invalid DB nm \"%s\"", new_db_nm);
2064 return 0;
2065 }
2066 }
2067
2068 if (new_db_fd >= 0) {
2069 if (new_hash_len != 0) {
2070 dcc_logbad(EX_SOFTWARE,
2071 "extending db_open(%s) without locking",
2072 db_nm);
2073 return 0;
2074 }
2075 if (!db_rdonly) {
2076 dcc_logbad(EX_SOFTWARE,
2077 "db_open(%s) read/write without locking",
2078 db_nm);
2079 return 0;
2080 }
2081 db_open_flags = O_RDONLY;
2082 hash_flags = O_RDONLY;
2083
2084 db_fd = new_db_fd;
2085
2086 } else {
2087 db_open_flags = O_RDWR;
2088 if (new_hash_len != 0) {
2089 if (db_rdonly) {
2090 dcc_logbad(EX_SOFTWARE,
2091 "db_open(%s) creating read-only",
2092 db_nm);
2093 return 0;
2094 }
2095 hash_flags = O_RDWR | O_CREAT;
2096 } else {
2097 /* must open the file read/write to lock it */
2098 hash_flags = O_RDWR;
2099 }
2100
2101 db_fd = dcc_lock_open(emsg, db_nm, db_open_flags,
2102 (mode & DB_OPEN_LOCK_NOWAIT)
2103 ? DCC_LOCK_OPEN_NOWAIT
2104 : 0,
2105 DCC_LOCK_ALL_FILE, 0);
2106 if (db_fd == -1) {
2107 db_close(-1);
2108 return 0;
2109 }
2110 }
2111 gettimeofday(&db_time, 0);
2112 db_locked = db_time;
2113 if (0 > fstat(db_fd, &db_sb)) {
2114 dcc_pemsg(EX_IOERR, emsg, "stat(%s): %s", db_nm, ERROR_STR());
2115 OPEN_BAIL();
2116 return 0;
2117 }
2118 db_csize = db_fsize = db_sb.st_size;
2119 if (db_fsize < ISZ(DB_HDR)) {
2120 dcc_pemsg(EX_IOERR, emsg,
2121 "%s with %d bytes is too small to be a DCC database",
2122 db_nm, (int)db_fsize);
2123 OPEN_BAIL();
2124 }
2125
2126 /* check the header of the database file by temporarily mapping it */
2127 db_buf_init(0, sizeof(DB_HDR));
2128 if (!map_db(emsg, 0, sizeof(DB_HDR), &db_sts.db_parms, 0))
2129 OPEN_BAIL();
2130
2131 db_parms_stored = *db_sts.db_parms.d.parms;
2132 db_parms = *db_sts.db_parms.d.parms;
2133
2134 if (memcmp(db_parms.version, db_version_buf, sizeof(db_version_buf))) {
2135 dcc_pemsg(EX_DATAERR, emsg,
2136 "%s contains the wrong magic string \"%.*s\"",
2137 db_nm, ISZ(db_parms.version), db_parms.version);
2138 OPEN_BAIL();
2139 }
2140 if (!(db_parms.flags & DB_PARM_FG_GREY) != !grey_on) {
2141 dcc_pemsg(EX_DATAERR, emsg,
2142 "%s is%s a greylist database but must%s be",
2143 db_nm,
2144 (db_parms.flags & DB_PARM_FG_GREY) ? "" : " not",
2145 grey_on ? "" : " not");
2146 OPEN_BAIL();
2147 }
2148
2149 cur_pagesize = db_parms.pagesize;
2150
2151 DB_SET_NOKEEP(db_parms.nokeep_cks, DCC_CK_INVALID);
2152 DB_SET_NOKEEP(db_parms.nokeep_cks, DCC_CK_FLOD_PATH);
2153 set_db_tholds(db_parms.nokeep_cks);
2154
2155 db_ck_fuzziness = grey_on ? grey_ck_fuzziness : dcc_ck_fuzziness;
2156
2157 db_csize = db_parms.db_csize;
2158 if (db_csize < sizeof(DB_HDR)) {
2159 dcc_pemsg(EX_DATAERR, emsg,
2160 "%s says it contains "L_DPAT" bytes"
2161 " or fewer than the minimum of %d",
2162 db_nm, db_csize, DB_PTR_BASE);
2163 /* that is a fatal error if we are not rebuilding */
2164 if (new_hash_len != 0)
2165 OPEN_BAIL();
2166 }
2167 if (db_csize > db_fsize) {
2168 dcc_pemsg(EX_DATAERR, emsg,
2169 "%s says it contains "L_DPAT" bytes"
2170 " or more than the actual size of "OFF_DPAT,
2171 db_nm, db_csize, db_fsize);
2172 /* that is a fatal error if we are not rebuilding */
2173 if (new_hash_len != 0)
2174 OPEN_BAIL();
2175 }
2176
2177 /* The buffer or page size we use must be the page size used to
2178 * write the files. Try to change our size to match the file */
2179 if (cur_pagesize != db_pagesize) {
2180 db_invalidate = 1;
2181 rel_db_states();
2182 if (!db_unload(emsg, 0))
2183 OPEN_BAIL();
2184 db_invalidate = 0;
2185 if (!db_buf_init(cur_pagesize, 0)) {
2186 dcc_error_msg("%s has page size %d"
2187 " incompatible with %d in %s",
2188 db_nm,
2189 cur_pagesize, db_get_pagesize(0, 0),
2190 path2fnm(db_hash_nm));
2191 OPEN_BAIL();
2192 }
2193 }
2194
2195 db_csize_stored_hash = 0;
2196 db_hash_len = 0;
2197 db_hash_fd = open(db_hash_nm, hash_flags, 0666);
2198 if (db_hash_fd < 0) {
2199 dcc_pemsg(EX_IOERR, emsg, "open(%s): %s",
2200 db_hash_nm, ERROR_STR());
2201 OPEN_BAIL();
2202 }
2203 if (0 > fcntl(db_hash_fd, F_SETFD, FD_CLOEXEC)) {
2204 dcc_pemsg(EX_IOERR, emsg, "fcntl(%s, FD_CLOEXEC): %s",
2205 db_hash_nm, ERROR_STR());
2206 OPEN_BAIL();
2207 }
2208
2209 if (new_hash_len != 0) {
2210 if (!make_new_hash(emsg, new_hash_len))
2211 OPEN_BAIL();
2212 } else {
2213 if (!check_old_hash(emsg))
2214 OPEN_BAIL();
2215 }
2216
2217 if (db_fsize % db_pagesize != 0) {
2218 dcc_pemsg(EX_DATAERR, emsg,
2219 "%s has size "OFF_HPAT","
2220 " not a multiple of its page size of %#x",
2221 db_nm, db_fsize, db_pagesize);
2222 OPEN_BAIL();
2223 }
2224 if (db_fsize > db_csize + db_pagesize || db_csize > db_fsize) {
2225 dcc_pemsg(EX_DATAERR, emsg,
2226 "%s has size "OFF_HPAT" but claims "L_HPAT,
2227 db_nm, db_fsize, db_csize);
2228 OPEN_BAIL();
2229 }
2230
2231 #ifndef USE_MAP_NOSYNC
2232 /* Use `dbclean -F` on systems without mmap(NOSYNC) but with lots of
2233 * RAM. Some Linux systems otherwise take too long to run dbclean. */
2234 if (mode & DB_OPEN_MMAP_WRITE_NOSYNC) {
2235 if (db_max_rss > db_fsize + db_hash_fsize)
2236 db_use_write = 1;
2237 if (db_debug)
2238 quiet_trace_msg("db_max_rss="OFF_HPAT
2239 " db_fsize+db_hash_fsize="OFF_HPAT
2240 " so%s use -F",
2241 db_max_rss, db_fsize+db_hash_fsize,
2242 db_use_write ? "" : " do not");
2243 }
2244 #endif
2245
2246 db_window_size = (DB_PTR)db_pagesize * db_buf_total;
2247 snprintf(db_window_size_str, sizeof(db_window_size_str),
2248 "window=%s%s",
2249 db_ptr2str(db_window_size), db_physmem_str);
2250 rel_db_states();
2251 db_failed_line = 0;
2252
2253 return 1;
2254 #undef OPEN_BAIL
2255 }
2256
2257
2258
2259 static u_char
2260 buf_munmap(DCC_EMSG emsg, DB_BUF *b)
2261 {
2262 u_char result;
2263
2264 if (b->lock_cnt != 0)
2265 dcc_logbad(EX_SOFTWARE, "unmapping locked DB buffer");
2266
2267 result = buf_flush(emsg, b, 1);
2268
2269 if (db_invalidate) {
2270 if (0 > DCC_MADV_FREE(b->buf.v))
2271 dcc_error_msg("madvise(FREE %s,%#x): %s",
2272 buf2path(b), db_pagesize, ERROR_STR());
2273 }
2274
2275 if (0 > munmap(b->buf.v, db_pagesize)) {
2276 db_failure(__LINE__,__FILE__, EX_IOERR, emsg,
2277 "munmap(%s,%d): %s",
2278 buf2path(b), db_pagesize, ERROR_STR());
2279 result = 0;
2280 }
2281 b->buf.v = 0;
2282 b->pg_num = -1;
2283 b->buf_type = DB_BUF_TYPE_FREE;
2284
2285 return result;
2286 }
2287
2288
2289
2290 static u_char
2291 buf_mmap(DCC_EMSG emsg, DB_BUF *b, DB_PG_NUM pg_num, u_char extend)
2292 {
2293 int prot, flags;
2294 off_t offset;
2295 int fd;
2296 void *p;
2297 int retry;
2298 u_char unloaded;
2299
2300
2301 offset = (off_t)pg_num * (off_t)db_pagesize;
2302 fd = buf2fd(b);
2303
2304 if (extend) {
2305 offset = 0;
2306 #if defined(MAP_ANON)|| defined(MAP_ANONYMOUS)
2307 fd = -1;
2308 b->flags |= DB_BUF_FG_USE_WRITE | DB_BUF_FG_EXTENSION;
2309 #ifdef MAP_ANONYMOUS
2310 /* Linux redefines things and requires either MAP_ANON
2311 * or MAP_PRIVATE; */
2312 flags = MAP_ANONYMOUS| MAP_PRIVATE;
2313 #else
2314 flags = MAP_ANON | MAP_PRIVATE;
2315 #endif /* MAP_ANONYMOUS */
2316 #else /* have neither MAP_ANON nor MAP_ANONYMOUS */
2317 b->flags |= DB_BUF_FG_USE_WRITE;
2318 flags = MAP_PRIVATE;
2319 #endif
2320 } else if (db_rdonly) {
2321 flags = MAP_SHARED;
2322 } else if (db_use_write && !db_minimum_map) {
2323 /* write() buffers instead of letting the Solaris virtual
2324 * memory system do it. Solaris will bog the system down doing
2325 * nothing but flushing dirty mmap() pages
2326 * We cannot use this hack in two processes simultaneously,
2327 * so do not use it in dccd while dbclean is running */
2328 b->flags |= DB_BUF_FG_USE_WRITE;
2329 flags = MAP_PRIVATE;
2330 } else {
2331 #ifdef USE_MAP_NOSYNC
2332 flags = (MAP_SHARED | MAP_NOSYNC);
2333 #else
2334 flags = MAP_SHARED;
2335 #endif
2336 }
2337
2338 prot = db_rdonly ? PROT_READ : (PROT_READ | PROT_WRITE);
2339 for (retry = 1, unloaded = 2; unloaded > 1; ++retry) {
2340 p = mmap(0, db_pagesize, prot, flags, fd, offset);
2341
2342 if (p == MAP_FAILED) {
2343 if (errno == EACCES
2344 || errno == EBADF
2345 || errno == EINVAL
2346 || errno == ENODEV
2347 || retry > 20) {
2348 dcc_pemsg(EX_IOERR, emsg,
2349 "try #%d"" mmap(%s"
2350 " %#x,%#x,%#x,%d,"OFF_HPAT"): %s",
2351 retry,
2352 buf2path(b),
2353 db_pagesize, prot, flags, fd, offset,
2354 ERROR_STR());
2355 return 0;
2356 }
2357 dcc_error_msg("try #%d mmap(%s"
2358 " %#x,%#x,%#x,%d,"OFF_HPAT"): %s",
2359 retry,
2360 buf2path(b),
2361 db_pagesize, prot, flags, fd, offset,
2362 ERROR_STR());
2363 /* #define MMAP_FAIL_DEBUG 3 */
2364 #ifdef MMAP_FAIL_DEBUG
2365 } else if (((uint)random() % MMAP_FAIL_DEBUG) == 0) {
2366 /* pretend mmap() failed randomly */
2367 dcc_error_msg(" test fail #%d mmap(%s,%#x,"OFF_HPAT")",
2368 retry,
2369 buf2path(b), db_pagesize, offset);
2370 if (0 > munmap(p, db_pagesize))
2371 dcc_error_msg( "test munmap(): %s",
2372 ERROR_STR());
2373 #endif
2374 } else {
2375 /* It worked.
2376 * Say so if it was not the first attempt. */
2377 if (retry != 1)
2378 dcc_error_msg("try #%d"
2379 " mmap(%s,%#x,"OFF_HPAT") ok",
2380 retry,
2381 buf2path(b), db_pagesize, offset);
2382 break;
2383 }
2384
2385 /* mmap() fails occassionally on some systems,
2386 * so try to release something and try again */
2387 unloaded = db_unload(0, 1);
2388 }
2389
2390
2391 b->buf.v = p;
2392 b->flush = 0;
2393 b->flush_urgent = 0;
2394
2395 if (extend)
2396 return 1;
2397
2398 /* madvise() on some systems including FreeBSD uses a lot of CPU cycles,
2399 * so it should not be done unless it is likely to do significant good.
2400 * Get all of our buffers if there is plenty of memory
2401 * and we are not trying to stay out of the way of dbclean. */
2402 if (!db_minimum_map && db_fsize <= db_max_rss) {
2403 /* The flat file would fit. If the hash table would also
2404 * fit, tell the kernel to be aggressive */
2405 if (db_fsize + db_hash_fsize <= db_max_rss
2406 && 0 > DCC_MADV_WILLNEED(p))
2407 dcc_error_msg("madvise(WILLNEED %s,%#x): %s",
2408 buf2path(b), db_pagesize, ERROR_STR());
2409 } else {
2410 if (0 > DCC_MADV_RANDOM(p))
2411 dcc_error_msg("madvise(RANDOM %s,%#x): %s",
2412 buf2path(b), db_pagesize, ERROR_STR());
2413 }
2414
2415 return 1;
2416 }
2417
2418
2419
2420 /* get a free buffer for a chunk of either the hash table or database files */
2421 static DB_BUF *
2422 get_free_buf(DCC_EMSG emsg, DB_BUF **bh)
2423 {
2424 DB_BUF *b;
2425
2426 /* Look for an unlocked buffer.
2427 * We know there is one because we have more buffers than
2428 * can be locked simultaneously. */
2429 b = buf_oldest;
2430 for (;;) {
2431 if (!b)
2432 dcc_logbad(EX_SOFTWARE, "broken DB buffer MRU chain");
2433 if (!b->lock_cnt)
2434 break;
2435 b = b->newer;
2436 }
2437
2438 /* Found an unlocked buffer.
2439 * Unlink it from its hash chain. */
2440 if (b->fwd)
2441 b->fwd->bak = b->bak;
2442 if (b->bak)
2443 b->bak->fwd = b->fwd;
2444 else if (b->hash)
2445 *b->hash = b->fwd;
2446 if (b->buf_type != DB_BUF_TYPE_FREE) {
2447 if (!buf_munmap(emsg, b))
2448 return 0;
2449 }
2450
2451 b->flags = 0;
2452
2453 /* put it on the new hash chain */
2454 b->bak = 0;
2455 b->hash = bh;
2456 b->fwd = *bh;
2457 *bh = b;
2458 if (b->fwd)
2459 b->fwd->bak = b;
2460
2461 return b;
2462 }
2463
2464
2465
2466 static DB_BUF *
2467 find_buf(DCC_EMSG emsg, DB_BUF_TYPE buf_type, DB_PG_NUM pg_num)
2468 {
2469 DB_BUF *b, **bh;
2470
2471 bh = DB_BUF_HASH(pg_num, buf_type);
2472 b = *bh;
2473 for (;;) {
2474 if (!b) {
2475 /* we ran off the end of the buffer hash chain,
2476 * so get a free buffer */
2477 b = get_free_buf(emsg, bh);
2478 if (!b)
2479 return 0;
2480 b->buf_type = buf_type;
2481 b->pg_num = pg_num;
2482 break;
2483 }
2484 if (b->buf_type == buf_type
2485 && b->pg_num == pg_num)
2486 break; /* found the buffer we need */
2487
2488 b = b->fwd;
2489 }
2490
2491 /* make the buffer newest */
2492 if (buf_newest != b) {
2493 /* unlink it */
2494 b->newer->older = b->older;
2495 if (b->older)
2496 b->older->newer = b->newer;
2497 else
2498 buf_oldest = b->newer;
2499 /* insert it at the head of the MRU list */
2500 b->newer = 0;
2501 b->older = buf_newest;
2502 buf_newest->newer = b;
2503 buf_newest = b;
2504 }
2505
2506 return b;
2507 }
2508
2509
2510
2511 static DB_BUF *
2512 find_st_buf(DCC_EMSG emsg, DB_BUF_TYPE buf_type, DB_STATE *st,
2513 DB_PG_NUM pg_num, u_char extend)
2514 {
2515 DB_BUF *b;
2516
2517 /* release previous buffer unless it is the right one */
2518 b = st->b;
2519 if (b) {
2520 if (b->pg_num == pg_num
2521 && b->buf_type == buf_type)
2522 return b; /* already have the target buffer */
2523
2524 st->b = 0;
2525 st->d.v = 0;
2526 if (--b->lock_cnt < 0)
2527 dcc_logbad(EX_SOFTWARE, "bad database buffer lock");
2528 }
2529
2530 /* look for the buffer */
2531 b = find_buf(emsg, buf_type, pg_num);
2532 if (!b)
2533 return 0;
2534
2535 ++b->lock_cnt;
2536 if (b->buf.v) {
2537 if (extend && !(b->flags & DB_BUF_FG_USE_WRITE))
2538 dcc_logbad(EX_SOFTWARE, "extending ordinary buffer");
2539
2540 } else {
2541 /* map it if it was not already known */
2542 if (!buf_mmap(emsg, b, pg_num, extend)) {
2543 b->buf_type = DB_BUF_TYPE_FREE;
2544 b->pg_num = -1;
2545 if (--b->lock_cnt != 0)
2546 dcc_logbad(EX_SOFTWARE,
2547 "stolen database buffer lock %d",
2548 b->lock_cnt);
2549 return 0;
2550 }
2551 if (buf_type == DB_BUF_TYPE_DB)
2552 ++db_stats.db_mmaps;
2553 else if (buf_type == DB_BUF_TYPE_HASH)
2554 ++db_stats.hash_mmaps;
2555 }
2556
2557 st->b = b;
2558 st->d.v = 0;
2559 return b;
2560 }
2561
2562
2563
2564 static u_char
2565 map_hash_ctl(DCC_EMSG emsg, u_char new)
2566 {
2567 DB_BUF *b;
2568
2569 b = find_st_buf(emsg, DB_BUF_TYPE_HASH, &db_sts.hash_ctl, 0, new);
2570 if (!b)
2571 return 0;
2572 db_sts.hash_ctl.s.haddr = 0;
2573 db_sts.hash_ctl.d.v = b->buf.v;
2574 return 1;
2575 }
2576
2577
2578
2579 /* mmap() a hash table entry */
2580 static u_char
2581 map_hash(DCC_EMSG emsg,
2582 DB_HADDR haddr, /* this entry */
2583 DB_STATE *st, /* point this to the entry */
2584 u_char new)
2585 {
2586 DB_PG_NUM pg_num;
2587 DB_PG_OFF pg_off;
2588 DB_BUF *b;
2589
2590 if (haddr >= db_hash_len || haddr < DB_HADDR_BASE) {
2591 dcc_pemsg(EX_DATAERR, emsg, "invalid hash address %#x",
2592 haddr);
2593 return 0;
2594 }
2595
2596 pg_num = haddr / db_hash_page_len;
2597 pg_off = haddr % db_hash_page_len;
2598
2599 b = find_st_buf(emsg, DB_BUF_TYPE_HASH, st, pg_num, new);
2600 if (!b)
2601 return 0;
2602 st->s.haddr = haddr;
2603 st->d.h = &b->buf.h[pg_off];
2604 return 1;
2605 }
2606
2607
2608
2609 /* unlink a hash table entry from the free list
2610 * uses db_sts.tmp */
2611 static u_char
2612 unlink_free_hash(DCC_EMSG emsg,
2613 DB_STATE *hash_st) /* remove this from the free list */
2614 {
2615 DB_HADDR fwd, bak;
2616
2617 if (!db_make_dirty(emsg))
2618 return 0;
2619
2620 fwd = DB_HADDR_EX(hash_st->d.h->fwd);
2621 bak = DB_HADDR_EX(hash_st->d.h->bak);
2622 if (!HE_IS_FREE(hash_st->d.h)
2623 || (DB_HADDR_INVALID(fwd) && fwd != FREE_HADDR_END)
2624 || (DB_HADDR_INVALID(bak) && bak != FREE_HADDR_END)
2625 || DB_HPTR_EX(hash_st->d.h->rcd) != DB_PTR_NULL) {
2626 dcc_pemsg(EX_DATAERR, emsg,
2627 "bad hash free list entry at %#x", hash_st->s.haddr);
2628 return 0;
2629 }
2630
2631 if (fwd != FREE_HADDR_END) {
2632 if (!map_hash(emsg, fwd, &db_sts.tmp, 0))
2633 return 0;
2634 if (DB_HADDR_EX(db_sts.tmp.d.h->bak) != hash_st->s.haddr) {
2635 dcc_pemsg(EX_DATAERR, emsg, "free %#x --> bad-free %#x",
2636 hash_st->s.haddr, fwd);
2637 return 0;
2638 }
2639 DB_HADDR_CP(db_sts.tmp.d.h->bak, bak);
2640 SET_FLUSH_HE(&db_sts.tmp);
2641 } else {
2642 if (!map_hash_ctl(emsg, 0))
2643 return 0;
2644 if (db_sts.hash_ctl.d.vals->s.free_bak != hash_st->s.haddr) {
2645 dcc_pemsg(EX_DATAERR, emsg, "free %#x --> bad-free %#x",
2646 hash_st->s.haddr, fwd);
2647 return 0;
2648 }
2649 db_sts.hash_ctl.d.vals->s.free_bak = bak;
2650 SET_FLUSH_HCTL(0);
2651 }
2652
2653 if (bak != FREE_HADDR_END) {
2654 if (!map_hash(emsg, bak, &db_sts.tmp, 0))
2655 return 0;
2656 if (DB_HADDR_EX(db_sts.tmp.d.h->fwd) != hash_st->s.haddr) {
2657 dcc_pemsg(EX_DATAERR, emsg, "bad free %#x <-- free %#x",
2658 bak, hash_st->s.haddr);
2659 return 0;
2660 }
2661 DB_HADDR_CP(db_sts.tmp.d.h->fwd, fwd);
2662 SET_FLUSH_HE(&db_sts.tmp);
2663 } else {
2664 if (!map_hash_ctl(emsg, 0))
2665 return 0;
2666 if (db_sts.hash_ctl.d.vals->s.free_fwd != hash_st->s.haddr) {
2667 dcc_pemsg(EX_DATAERR, emsg, "free %#x --> bad-free %#x",
2668 hash_st->s.haddr, bak);
2669 return 0;
2670 }
2671 db_sts.hash_ctl.d.vals->s.free_fwd = fwd;
2672 SET_FLUSH_HCTL(0);
2673 }
2674
2675 memset(hash_st->d.h, 0, sizeof(HASH_ENTRY));
2676 SET_FLUSH_HE(hash_st);
2677
2678 ++db_hash_used;
2679 return 1;
2680 }
2681
2682
2683
2684 /* get a free hash table entry and leave db_sts.free pointing to it */
2685 static u_char /* 0=failed, 1=got it */
2686 get_free_hash(DCC_EMSG emsg,
2687 DB_HADDR result) /* try near here */
2688 {
2689 DB_HADDR pg_start, pg_lim, bak;
2690 int i;
2691
2692 if (db_hash_len <= db_hash_used) {
2693 dcc_pemsg(EX_OSFILE, emsg, "no free hash table entry;"
2694 " %d of %d used", db_hash_used, db_hash_len);
2695 return 0;
2696 }
2697
2698 /* Look first near the target */
2699 if (result < DB_HADDR_BASE)
2700 result = DB_HADDR_BASE;
2701 pg_start = result - (result % db_hash_page_len);
2702 pg_lim = pg_start + db_hash_page_len-1;
2703 if (pg_lim >= db_hash_len)
2704 pg_lim = db_hash_len-1;
2705 for (i = 0; i < 3 && ++result < pg_lim; ++i) {
2706 if (!map_hash(emsg, result, &db_sts.free, 0))
2707 return 0;
2708 if (HE_IS_FREE(db_sts.free.d.h))
2709 return unlink_free_hash(emsg, &db_sts.free);
2710 }
2711
2712 /* check the local ad hoc free list at the end of the page */
2713 if (!map_hash(emsg, pg_lim, &db_sts.free, 0))
2714 return 0;
2715 if (HE_IS_FREE(db_sts.free.d.h)) {
2716 /* the ad hoc free list is not empty,
2717 * so try to use the previous entry */
2718 bak = DB_HADDR_EX(db_sts.free.d.h->bak);
2719 if (bak != FREE_HADDR_END) {
2720 if (!map_hash(emsg, bak, &db_sts.free, 0))
2721 return 0;
2722 }
2723 return unlink_free_hash(emsg, &db_sts.free);
2724 }
2725
2726
2727 /* Give up and search from the start of the free list. This happens
2728 * only when the current and all preceding pages are full. */
2729 if (!map_hash_ctl(emsg, 0))
2730 return 0;
2731 result = db_sts.hash_ctl.d.vals->s.free_fwd;
2732 if (DB_HADDR_INVALID(result)) {
2733 dcc_pemsg(EX_DATAERR, emsg,
2734 "broken hash free list head of %#x", result);
2735 return 0;
2736 }
2737 if (!map_hash(emsg, result, &db_sts.free, 0))
2738 return 0;
2739 return unlink_free_hash(emsg, &db_sts.free);
2740 }
2741
2742
2743
2744 /* mmap() a database entry
2745 * We assume that no database entry spans buffers,
2746 * and that there are enough buffers to accomodate all possible
2747 * concurrent requests. */
2748 static u_char
2749 map_db(DCC_EMSG emsg,
2750 DB_PTR rptr, /* address of the record */
2751 u_int tgt_len, /* its length */
2752 DB_STATE *st, /* point this to the record */
2753 u_char extend)
2754 {
2755 DB_PG_NUM pg_num;
2756 DB_PG_OFF pg_off;
2757 DB_BUF *b;
2758
2759 if (rptr+tgt_len > db_fsize) {
2760 db_failure(__LINE__,__FILE__, EX_DATAERR, emsg,
2761 "invalid database address "L_HPAT" or length %d"
2762 " past db_fsize "OFF_HPAT" in %s",
2763 rptr, tgt_len, db_fsize, db_nm);
2764 return 0;
2765 }
2766
2767 /* Try to optimize this to avoid udivdi3() and umoddi3(),
2768 * because they are a major time sink here on 32-bit systems */
2769 pg_num = DB_PTR2PG_NUM(rptr, db_pagesize);
2770 #ifdef HAVE_64BIT_LONG
2771 pg_off = rptr % db_pagesize;
2772 #else
2773 pg_off = rptr - pg_num*(DB_PTR)db_pagesize;
2774 #endif
2775
2776 /* do not go past the end of a buffer */
2777 if (tgt_len+pg_off > db_pagesize) {
2778 db_failure(__LINE__,__FILE__, EX_DATAERR, emsg,
2779 "invalid database address "L_HPAT
2780 " or length %#x in %s",
2781 rptr, tgt_len, db_nm);
2782 return 0;
2783 }
2784
2785 b = find_st_buf(emsg, DB_BUF_TYPE_DB, st, pg_num, extend);
2786 if (!b)
2787 return 0;
2788 st->s.rptr = rptr;
2789 st->d.r = (DB_RCD *)&b->buf.c[pg_off];
2790 return 1;
2791 }
2792
2793
2794
2795 u_char /* 0=failed, 1=got it */
2796 db_map_rcd(DCC_EMSG emsg,
2797 DB_STATE *rcd_st, /* point this to the record */
2798 DB_PTR rptr, /* that is here */
2799 int *rcd_lenp) /* put its length here */
2800 {
2801 u_int rcd_len;
2802
2803 if (DB_PTR_IS_BAD(rptr)) {
2804 dcc_pemsg(EX_DATAERR, emsg,
2805 "getting bogus record at "L_HPAT", in %s",
2806 rptr, db_nm);
2807 return 0;
2808 }
2809
2810 if (!map_db(emsg, rptr, DB_RCD_HDR_LEN, rcd_st, 0))
2811 return 0;
2812 rcd_len = DB_RCD_LEN(rcd_st->d.r);
2813
2814 if (&rcd_st->d.c[rcd_len] > &rcd_st->b->buf.c[db_pagesize]) {
2815 dcc_pemsg(EX_DATAERR, emsg,
2816 "invalid checksum count %d at "L_HPAT" in %s",
2817 DB_NUM_CKS(rcd_st->d.r), rptr, db_nm);
2818 return 0;
2819 }
2820
2821 if (rcd_lenp)
2822 *rcd_lenp = rcd_len;
2823 return 1;
2824 }
2825
2826
2827
2828 /* write the new sizes of the files into the files */
2829 static u_char
2830 db_set_sizes(DCC_EMSG emsg)
2831 {
2832 u_char result = 1;
2833
2834 if (db_hash_fd != -1
2835 && (db_csize_stored_hash != db_csize
2836 || db_hash_used_stored_hash != db_hash_used)) {
2837 if (!map_hash_ctl(emsg, 0)) {
2838 result = 0;
2839 } else {
2840 db_sts.hash_ctl.d.vals->s.db_csize = db_csize;
2841 db_csize_stored_hash = db_csize;
2842
2843 db_sts.hash_ctl.d.vals->s.used = db_hash_used;
2844 db_hash_used_stored_hash = db_hash_used;
2845
2846 SET_FLUSH_HCTL(0);
2847 }
2848 }
2849
2850 if (db_fd != -1
2851 && (db_parms_stored.db_csize != db_csize
2852 || db_parms_stored.hash_used != db_hash_used)) {
2853 if (!map_db(emsg, 0, sizeof(DB_HDR), &db_sts.db_parms, 0)) {
2854 result = 0;
2855 } else {
2856 db_sts.db_parms.d.parms->db_csize = db_csize;
2857 db_parms_stored.db_csize = db_csize;
2858 db_parms.db_csize = db_csize;
2859
2860 db_sts.db_parms.d.parms->hash_used = db_hash_used;
2861 db_parms_stored.hash_used = db_hash_used;
2862 db_parms.hash_used = db_hash_used;
2863
2864 db_sts.db_parms.d.parms->last_rate_sec = db_time.tv_sec;
2865 db_parms_stored.last_rate_sec = db_time.tv_sec;
2866 db_parms.last_rate_sec = db_time.tv_sec;
2867
2868 db_set_flush(&db_sts.db_parms, 1, sizeof(DB_PARMS));
2869 }
2870 }
2871
2872 return result;
2873 }
2874
2875
2876
2877 /* write the database parameters into the magic number headers of the files */
2878 u_char
2879 db_flush_parms(DCC_EMSG emsg)
2880 {
2881 if (!db_set_sizes(emsg))
2882 return 0;
2883
2884 if (db_fd == -1)
2885 return 1;
2886
2887 if (memcmp(&db_parms, &db_parms_stored, sizeof(db_parms))) {
2888 if (!map_db(emsg, 0, sizeof(DB_HDR), &db_sts.db_parms, 0))
2889 return 0;
2890
2891 db_parms.pagesize = db_pagesize;
2892
2893 *db_sts.db_parms.d.parms = db_parms;
2894 db_parms_stored = db_parms;
2895
2896 db_set_flush(&db_sts.db_parms, 1, sizeof(DB_PARMS));
2897 }
2898
2899 return 1;
2900 }
2901
2902
2903
2904 /* find a checksum in an already mapped record */
2905 DB_RCD_CK * /* 0=not found, 1=broken database */
2906 db_find_ck(DCC_EMSG emsg,
2907 DB_RCD *rcd,
2908 DB_PTR rptr,
2909 DCC_CK_TYPES type) /* find this type of checksum */
2910 {
2911 DB_RCD_CK *rcd_ck;
2912 int i;
2913
2914 rcd_ck = rcd->cks;
2915 i = DB_NUM_CKS(rcd);
2916 if (i >= DCC_NUM_CKS) {
2917 dcc_pemsg(EX_DATAERR, emsg,
2918 "impossible %d checksums in "L_HPAT" in %s",
2919 i, rptr, db_nm);
2920 return (DB_RCD_CK *)1;
2921 }
2922
2923 for (; i != 0; --i, ++rcd_ck) {
2924 if (DB_CK_TYPE(rcd_ck) == type)
2925 return rcd_ck;
2926 }
2927
2928 return 0;
2929 }
2930
2931
2932
2933 /* find a checksum type known to be in a record */
2934 DB_RCD_CK * /* 0=it's not there */
2935 db_map_rcd_ck(DCC_EMSG emsg,
2936 DB_STATE *rcd_st, /* point this to the record */
2937 DB_PTR rptr, /* that is here */
2938 DCC_CK_TYPES type) /* find this type of checksum */
2939 {
2940 DB_RCD_CK *rcd_ck;
2941
2942 if (!db_map_rcd(emsg, rcd_st, rptr, 0))
2943 return 0;
2944
2945 rcd_ck = db_find_ck(emsg, rcd_st->d.r, rptr, type);
2946 if (rcd_ck == (DB_RCD_CK *)1)
2947 return 0;
2948 if (rcd_ck == 0) {
2949 dcc_pemsg(EX_DATAERR, emsg,
2950 "missing \"%s\" checksum in "L_HPAT" in %s",
2951 DB_TYPE2STR(type), rptr, db_nm);
2952 return 0;
2953 }
2954 return rcd_ck;
2955 }
2956
2957
2958
2959 static inline u_char /* 1=has a small prime factor */
2960 modulus_has_divisor(DB_HADDR len)
2961 {
2962 static int primes[] = {
2963 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59,
2964 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127,
2965 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193,
2966 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269,
2967 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349,
2968 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431,
2969 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499};
2970 int *p;
2971
2972 for (p = &primes[0]; p <= LAST(primes); ++p) {
2973 if ((len % *p) == 0)
2974 return 1;
2975 }
2976 return 0;
2977 }
2978
2979
2980
2981 /* Get a modulus for the hash function that is tolerably likely to be
2982 * relatively prime to most inputs. The worst that happens when the modulus
2983 * is composite is that large multiples of its factors will suffer more
2984 * collisions. */
2985 DB_HADDR
2986 get_db_hash_divisor(DB_HADDR len)
2987 {
2988 DB_HADDR divisor;
2989
2990 divisor = len - DB_HADDR_BASE;
2991 if (!(divisor & 1))
2992 --divisor;
2993 while (divisor >= MIN_HASH_ENTRIES) {
2994 if (modulus_has_divisor(divisor))
2995 divisor -= 2;
2996 else
2997 break;
2998 }
2999 return divisor;
3000 }
3001
3002
3003
3004 DB_HADDR
3005 db_hash(DCC_CK_TYPES type, const DCC_SUM sum)
3006 {
3007 u_int64_t accum, wrap;
3008 const u_int32_t *wp;
3009 union {
3010 DCC_SUM sum;
3011 u_int32_t words[4];
3012 } buf;
3013 int align;
3014 DB_HADDR haddr;
3015
3016 #ifdef HAVE_64BIT_PTR
3017 align = (u_int64_t)sum & 3;
3018 #else
3019 align = (u_int)sum & 3;
3020 #endif
3021 if (align == 0) {
3022 /* We almost always take this branch because database
3023 * records contain 12+N*24 bytes. That also implies that
3024 * we should not hope for better than 4 byte alignment. */
3025 wp = (u_int32_t *)sum;
3026 } else {
3027 memcpy(buf.sum, sum, sizeof(buf.sum));
3028 wp = buf.words;
3029 }
3030
3031 /* MD5 checksums are uniformly distributed, and so DCC_SUMs are
3032 * directly useful for hashing except when they are server-IDs */
3033 accum = *wp++;
3034 accum += *wp++;
3035 wrap = accum >>32;
3036 accum <<= 32;
3037 accum += wrap + type;
3038 accum += *wp++;
3039 accum += *wp;
3040
3041 haddr = accum % db_hash_divisor;
3042 haddr += DB_HADDR_BASE;
3043
3044 /* do not hash into the last slot of a page, because it is used to
3045 * find local free slots */
3046 if (haddr % db_hash_page_len == db_hash_page_len-1) {
3047 ++haddr;
3048 if (haddr >= db_hash_len)
3049 haddr = DB_HADDR_BASE;
3050 }
3051 return haddr;
3052 }
3053
3054
3055
3056 /* look for a checksum in the hash table
3057 * return with an excuse, the home slot, or the last entry on
3058 * the collision chain */
3059 DB_FOUND
3060 db_lookup(DCC_EMSG emsg, DCC_CK_TYPES type, const DCC_SUM sum,
3061 DB_HADDR lo, /* postpone if out of this window */
3062 DB_HADDR hi,
3063 DB_STATE *hash_st, /* hash block for record or related */
3064 DB_STATE *rcd_st, /* put the record or garbage here */
3065 DB_RCD_CK **prcd_ck) /* point to cksum if found */
3066 {
3067 DB_HADDR haddr, haddr_fwd, haddr_bak;
3068 DB_PTR db_ptr;
3069 DB_RCD_CK *found_ck;
3070 DB_HADDR failsafe;
3071
3072 haddr = db_hash(type, sum);
3073 if (haddr < lo || haddr > hi) {
3074 if (lo == 0 && hi == MAX_HASH_ENTRIES) {
3075 dcc_pemsg(EX_DATAERR, emsg,
3076 "out of range hash address");
3077 return DB_FOUND_SYSERR;
3078 }
3079 return DB_FOUND_LATER;
3080 }
3081
3082 if (prcd_ck)
3083 *prcd_ck = 0;
3084
3085 if (!map_hash(emsg, haddr, hash_st, 0))
3086 return DB_FOUND_SYSERR;
3087
3088 if (HE_IS_FREE(hash_st->d.h))
3089 return DB_FOUND_EMPTY;
3090
3091 if (!DB_HADDR_C_NULL(hash_st->d.h->bak))
3092 return DB_FOUND_INTRUDER;
3093
3094 /* We know that the current hash table entry is in its home slot.
3095 * It might be for the key or checksum we are looking for
3096 * or it might be for some other checksum with the same hash value. */
3097 for (failsafe = 0; failsafe <= db_hash_len; ++failsafe) {
3098 if (HE_CMP(hash_st->d.h, type, sum)) {
3099 /* This hash table entry could be for our target
3100 * checksum. Read the corresponding record so we
3101 * decide whether we have a hash collision or we
3102 * have found a record containing our target checksum.
3103 *
3104 * find right type of checksum in the record */
3105 db_ptr = DB_HPTR_EX(hash_st->d.h->rcd);
3106 found_ck = db_map_rcd_ck(emsg, rcd_st, db_ptr, type);
3107 if (!found_ck)
3108 return DB_FOUND_SYSERR;
3109 if (!memcmp(sum, found_ck->sum,
3110 sizeof(DCC_SUM))) {
3111 if (prcd_ck)
3112 *prcd_ck = found_ck;
3113 return DB_FOUND_IT;
3114 }
3115 }
3116
3117 /* This DB record was a hash collision, or for a checksum
3118 * other than our target.
3119 * Fail if this is the end of the hash chain */
3120 haddr_fwd = DB_HADDR_EX(hash_st->d.h->fwd);
3121 if (haddr_fwd == DB_HADDR_NULL)
3122 return DB_FOUND_CHAIN;
3123
3124 if (DB_HADDR_INVALID(haddr_fwd)) {
3125 dcc_pemsg(EX_DATAERR, emsg,
3126 "broken hash chain fwd-link"
3127 " #%d %#x at %#x in %s",
3128 failsafe, haddr_fwd, haddr, db_hash_nm);
3129 return DB_FOUND_SYSERR;
3130 }
3131
3132 if (!map_hash(emsg, haddr_fwd, hash_st, 0))
3133 return DB_FOUND_SYSERR;
3134
3135 haddr_bak = DB_HADDR_EX(hash_st->d.h->bak);
3136 if (haddr_bak != haddr) {
3137 dcc_pemsg(EX_DATAERR, emsg,
3138 "broken hash chain links #%d,"
3139 " %#x-->%#x but %#x<--%#x in %s",
3140 failsafe,
3141 haddr, haddr_fwd,
3142 haddr_bak, haddr_fwd,
3143 db_hash_nm);
3144 return DB_FOUND_SYSERR;
3145 }
3146 haddr = haddr_fwd;
3147 }
3148 dcc_pemsg(EX_DATAERR, emsg, "infinite hash chain at %#x in %s",
3149 haddr, db_hash_nm);
3150 return DB_FOUND_SYSERR;
3151 }
3152
3153
3154
3155 /* combine checksums */
3156 DCC_TGTS
3157 db_sum_ck(DCC_TGTS prev, /* previous sum */
3158 DCC_TGTS rcd_tgts, /* from the record */
3159 DCC_CK_TYPES type UATTRIB)
3160 {
3161 DCC_TGTS res;
3162
3163 /* This arithmetic must be commutative (after handling deleted
3164 * values), because inter-server flooding causes records to appear in
3165 * the database out of temporal order.
3166 *
3167 * DCC_TGTS_TOO_MANY can be thought of as a count of plus infinity.
3168 * DCC_TGTS_OK is like minus infinity.
3169 * DCC_TGTS_OK2 like half of minus infinity
3170 * DCC_TGTS_TOO_MANY (plus infinity) added to DCC_TGTS_OK (minus
3171 * infinity) or DCC_TGTS_OK2 yields DCC_TGTS_OK or DCC_TGTS_OK2.
3172 *
3173 * Reputations never reach infinity.
3174 *
3175 * Claims of not-spam from all clients are discarded as they arrive
3176 * and before here. They can only come from the local white list
3177 */
3178 #define SUM_OK_DEL(p,r) { \
3179 if (rcd_tgts == DCC_TGTS_OK || prev == DCC_TGTS_OK) \
3180 return DCC_TGTS_OK; \
3181 if (rcd_tgts == DCC_TGTS_OK2 || prev == DCC_TGTS_OK2) \
3182 return DCC_TGTS_OK2; \
3183 if (rcd_tgts == DCC_TGTS_DEL) \
3184 return prev; \
3185 }
3186
3187 res = prev+rcd_tgts;
3188 if (res <= DCC_TGTS_TOO_MANY)
3189 return res;
3190
3191 SUM_OK_DEL(prev, rcd_tgts);
3192 return DCC_TGTS_TOO_MANY;
3193 #undef SUM_OK_DEL
3194 }
3195
3196
3197
3198 /* delete all reports that contain the given checksum */
3199 static u_char /* 1=done, 0=broken database */
3200 del_ck(DCC_EMSG emsg,
3201 DCC_TGTS *res, /* residual targets after deletion */
3202 const DB_RCD *new, /* delete reports older than this one */
3203 DCC_CK_TYPES type, /* delete this type of checksum */
3204 DB_RCD_CK *prev_ck, /* starting with this one */
3205 DB_STATE *prev_st) /* use this scratch state block */
3206 {
3207 DB_PTR prev;
3208
3209 *res = 0;
3210 for (;;) {
3211 /* delete reports that are older than the delete request */
3212 if (dcc_ts_newer_ts(&new->ts, &prev_st->d.r->ts)
3213 && DB_RCD_ID(prev_st->d.r) != DCC_ID_WHITE) {
3214 DB_TGTS_RCD_SET(prev_st->d.r, 0);
3215 DB_TGTS_CK_SET(prev_ck, 0);
3216 SET_FLUSH_RCD(prev_st, 1);
3217
3218 } else {
3219 /* sum reports that are not deleted */
3220 *res = db_sum_ck(*res, DB_TGTS_RCD(prev_st->d.r), type);
3221 }
3222
3223 prev = DB_PTR_EX(prev_ck->prev);
3224 if (prev == DB_PTR_NULL)
3225 return 1;
3226 prev_ck = db_map_rcd_ck(emsg, prev_st, prev, type);
3227 if (!prev_ck)
3228 return 0;
3229 }
3230 }
3231
3232
3233
3234 /* see if the new and preceding records are from the same era */
3235 static inline u_char /* 1=different eras */
3236 ck_old_spam(const DB_RCD *new, const DCC_TS* prev, DCC_CK_TYPES type)
3237 {
3238 struct timeval tv;
3239 time_t secs;
3240 DCC_TS past;
3241
3242 secs = db_parms.ex_secs[type].spam;
3243 if (secs > DCC_OLD_SPAM_SECS)
3244 secs = DCC_OLD_SPAM_SECS;
3245 dcc_ts2timeval(&tv, &new->ts);
3246 dcc_timeval2ts(&past, &tv, -secs);
3247
3248 return dcc_ts_older_ts(prev, &past);
3249 }
3250
3251
3252
3253 /* Mark reports made obsolete by a spam report
3254 * A new report of spam makes sufficiently old reports obsolete.
3255 *
3256 * Sufficiently recent non-obsolete reports make a new report obsolete,
3257 * or at least not worth spending bandwidth to flood.
3258 * "Sufficiently recent" should be defined so that this server and
3259 * its downstream flooding peers always have reports of the checksums
3260 * in the report. So we want to keep (not make obsolete) at least one
3261 * report per expiration duration. We cannot know the expiration durations
3262 * of our peers, but we known DB_EXPIRE_SPAMSECS_DEF_MIN which influences
3263 * DCC_OLD_SPAM_SECS.
3264 *
3265 * However, if another checksum in the new report was kept, then
3266 * prefer marking old checksums obsolete.
3267 *
3268 * db_sts.rcd points to the new record
3269 * db_sts.rcd2 points the the previous record and is changed
3270 */
3271 static u_char /* 1=done, 0=broken database */
3272 ck_obs_spam(DCC_EMSG emsg,
3273 const DB_RCD *new,
3274 DCC_TGTS new_tgts,
3275 DB_RCD_CK *new_ck,
3276 DCC_CK_TYPES type, /* check this type of checksum */
3277 DB_RCD_CK *prev_ck, /* starting with this one */
3278 DCC_TGTS prev_ck_tgts,
3279 u_char *keeping_new) /* 1=already keeping the new record */
3280 {
3281 int limit;
3282 DB_PTR prev;
3283
3284 limit = 100;
3285 for (;;) {
3286 /* preceding white listed entries make new entries obsolete */
3287 if (DB_RCD_ID(db_sts.rcd2.d.r) == DCC_ID_WHITE) {
3288 new_ck->type_fgs |= DB_CK_FG_OBS;
3289 SET_FLUSH_RCD(&db_sts.rcd, 1);
3290 return 1;
3291 }
3292
3293 if (DB_CK_OBS(prev_ck)
3294 || DB_TGTS_RCD(db_sts.rcd2.d.r) == 0) {
3295 /* notice duplicates and
3296 * don't look forever for recent non-obsolete report */
3297 if (!memcmp(&new->ts, &db_sts.rcd2.d.r->ts,
3298 sizeof(new->ts))
3299 || --limit == 0) {
3300 *keeping_new = 1;
3301 return 1;
3302 }
3303
3304 } else if (prev_ck_tgts != DCC_TGTS_TOO_MANY) {
3305 /* Mark this predecessor obsolete because it
3306 * was before the checksum became spam. */
3307 prev_ck->type_fgs |= DB_CK_FG_OBS;
3308 SET_FLUSH_RCD(&db_sts.rcd2, 0);
3309
3310 /* continue backwards to mark more non-spam
3311 * predecessors obsolete */
3312
3313 } else if (!*keeping_new
3314 && ck_old_spam(new, &db_sts.rcd2.d.r->ts, type)) {
3315 /* We do not yet have a reason to keep the new report
3316 * and this predecessor is at or after a spam report.
3317 * We need the new report because it and the
3318 * predecessor are from different eras.
3319 * If the new report is not of spam, it will be
3320 * compressed with a preceding spam report. */
3321 *keeping_new = 1;
3322 /* The predecessor is not needed if the new record
3323 * is for spam */
3324 if (new_tgts == DCC_TGTS_TOO_MANY) {
3325 prev_ck->type_fgs |= DB_CK_FG_OBS;
3326 SET_FLUSH_RCD(&db_sts.rcd2, 0);
3327 }
3328 /* We're finished, because all older preceding reports
3329 * were marked obsolete when this older predecessor
3330 * was linked. */
3331 return 1;
3332
3333 } else {
3334 /* this predecessor is about as recent as the new
3335 * record, so the new record is unneeded noise that
3336 * would bloat other servers' databases. */
3337 new_ck->type_fgs |= DB_CK_FG_OBS;
3338 return 1;
3339 }
3340
3341 prev = DB_PTR_EX(prev_ck->prev);
3342 if (prev == DB_PTR_NULL) {
3343 /* the new record is a new report of spam */
3344 *keeping_new = 1;
3345 return 1;
3346 }
3347
3348 prev_ck = db_map_rcd_ck(emsg, &db_sts.rcd2, prev, type);
3349 if (!prev_ck)
3350 return 0;
3351 prev_ck_tgts = DB_TGTS_CK(prev_ck);
3352 }
3353 }
3354
3355
3356
3357 /* Mark extra server-ID declarations obsolete, walking back through the
3358 * predecessors of the new record until one settles the matter or the chain ends.
3359 * db_sts.rcd points to the new record
3360 * db_sts.rcd2 points to the previous record and is changed */
3361 static u_char /* 1=done, 0=broken database */
3362 srvr_id_ck(DCC_EMSG emsg, /* passed on to db_map_rcd_ck() */
3363 const DB_RCD *new, /* new record; only its ID and timestamp are read */
3364 DB_RCD_CK *new_ck, /* checksum in the new record; may be flagged obsolete */
3365 DB_RCD_CK *prev_ck) /* starting with this one */
3366 {
3367 DB_PTR prev;
3368 DCC_SRVR_ID new_id, prev_id;
3369 struct timeval tv;
3370 DCC_TS week_ts; /* timestamp of the new record rounded down as below */
3371
3372 dcc_ts2timeval(&tv, &new->ts);
3373 tv.tv_usec = 0;
3374 tv.tv_sec -= tv.tv_sec % (7*24*60*60); /* truncate to a multiple of 7 days */
3375 dcc_timeval2ts(&week_ts, &tv, 0);
3376
3377 new_id = DB_RCD_ID(new);
3378 for (;;) {
3379 /* mark duplicate older declarations and deletions obsolete */
3380 prev_id = DB_RCD_ID(db_sts.rcd2.d.r);
3381 if (!DCC_ID_SRVR_TYPE(prev_id)
3382 || DB_TGTS_RCD(db_sts.rcd2.d.r) == 0) { /* NOTE(review): presumably 0 targets marks a deletion; confirm */
3383 if (dcc_ts_newer_ts(&db_sts.rcd2.d.r->ts, &new->ts)) {
3384 new_ck->type_fgs |= DB_CK_FG_OBS; /* the predecessor is newer, so the new one loses */
3385 SET_FLUSH_RCD(&db_sts.rcd, 1);
3386 } else {
3387 prev_ck->type_fgs |= DB_CK_FG_OBS; /* otherwise the predecessor loses */
3388 SET_FLUSH_RCD(&db_sts.rcd2, 1);
3389 }
3390 return 1; /* either way the walk ends here */
3391 }
3392
3393 /* Keep many identical type declarations as a kludge to ensure
3394 * that rewound flooding sends type declarations early.
3395 * Keep only one declaration per week. */
3396 if (DCC_ID_SRVR_TYPE(new_id)) {
3397 /* Zap the new declaration and stop if the
3398 * new declaration is older than the predecessor. */
3399 if (dcc_ts_newer_ts(&db_sts.rcd2.d.r->ts, &new->ts)) {
3400 new_ck->type_fgs |= DB_CK_FG_OBS;
3401 SET_FLUSH_RCD(&db_sts.rcd, 1);
3402 return 1;
3403 }
3404
3405 /* Stop when we find a duplicate type declaration
3406 * of a different week */
3407 if (prev_id == new_id
3408 && dcc_ts_older_ts(&db_sts.rcd2.d.r->ts,
3409 &week_ts)) { /* week_ts is the truncated timestamp computed before the loop */
3410 return 1;
3411 }
3412
3413 /* continue zapping preceding declarations */
3414 prev_ck->type_fgs |= DB_CK_FG_OBS;
3415 SET_FLUSH_RCD(&db_sts.rcd2, 1);
3416 }
3417
3418 prev = DB_PTR_EX(prev_ck->prev); /* step to the next record back in the chain */
3419 if (prev == DB_PTR_NULL)
3420 return 1; /* no more predecessors */
3421
3422 prev_ck = db_map_rcd_ck(emsg, &db_sts.rcd2,
3423 prev, DCC_CK_SRVR_ID); /* remaps db_sts.rcd2 to that record */
3424 if (!prev_ck)
3425 return 0; /* the predecessor could not be mapped */
3426 }
3427 }
3428
3429
3430
3431 /* Install pointers in the hash table for each checksum of a record and fix the
3432 * accumulated counts in the record pointed to by db_sts.rcd.
3433 * Use db_sts.rcd, db_sts.hash, db_sts.rcd2, db_sts.free, db_sts.tmp
3434 * lo and hi are handed unchanged to db_lookup(). The caller must deal with db_make_dirty() */
3435 u_char /* 0=failed, 1=done (the final value is whatever db_set_sizes() returns) */
3436 db_link_rcd(DCC_EMSG emsg, DB_HADDR lo, DB_HADDR hi)
3437 {
3438 DCC_TGTS res; /* total that will be stored in the current checksum */
3439 DB_RCD *rcd;
3440 DB_RCD_CK *prev_ck; /* set by db_lookup() */
3441 DB_RCD_CK *rcd_ck; /* current checksum within rcd->cks */
3442 DCC_CK_TYPES rcd_type;
3443 DCC_TGTS rcd_tgts, prev_ck_tgts;
3444 int ck_num;
3445 DB_HADDR haddr;
3446 u_char keeping_new; /* one flag shared by all checksums of this record */
3447
3448 keeping_new = 0;
3449 rcd = db_sts.rcd.d.r;
3450 rcd_tgts = DB_TGTS_RCD_RAW(rcd);
3451 rcd_ck = rcd->cks;
3452 ck_num = DB_NUM_CKS(rcd);
3453 if (ck_num > DIM(rcd->cks)) { /* the count must fit the cks[] array */
3454 dcc_pemsg(EX_OSFILE, emsg,
3455 "bogus checksum count %#x at "L_HPAT" in %s",
3456 rcd->fgs_num_cks, db_sts.rcd.s.rptr, db_nm);
3457 return 0;
3458 }
3459 for (; ck_num > 0; --ck_num, ++rcd_ck) { /* one pass per checksum in the record */
3460 rcd_type = DB_CK_TYPE(rcd_ck);
3461 if (!DCC_CK_OK_DB(grey_on, rcd_type)) {
3462 dcc_pemsg(EX_OSFILE, emsg,
3463 "invalid checksum type %s at "L_HPAT" in %s",
3464 DB_TYPE2STR(rcd_type),
3465 db_sts.rcd.s.rptr, db_nm);
3466 return 0;
3467 }
3468
3469 rcd_ck->prev = DB_PTR_CP(DB_PTR_NULL); /* start unlinked; only DB_FOUND_IT sets a predecessor */
3470
3471 /* Do not link paths or whitelist file and line numbers */
3472 if (rcd_type == DCC_CK_FLOD_PATH) {
3473 DB_TGTS_CK_SET(rcd_ck, 0);
3474 continue;
3475 }
3476
3477 /* Do not link or total some checksums unless they are
3478 * whitelist entries. If they are whitelist entries, they
3479 * will eventually get set to DCC_TGTS_OK or DCC_TGTS_OK2.
3480 * Blacklist entries are noticed later by server-ID
3481 * or do not matter DCC_TGTS_TOO_MANY. */
3482 if (DB_TEST_NOKEEP(db_parms.nokeep_cks, rcd_type)
3483 && DB_RCD_ID(rcd) != DCC_ID_WHITE) {
3484 DB_TGTS_CK_SET(rcd_ck, 1);
3485 continue;
3486 }
3487
3488 res = (rcd_tgts == DCC_TGTS_DEL) ? 0 : rcd_tgts; /* a delete request adds nothing to the total */
3489
3490 switch (db_lookup(emsg, rcd_type, rcd_ck->sum, lo, hi,
3491 &db_sts.hash, &db_sts.rcd2, &prev_ck)) {
3492 case DB_FOUND_SYSERR:
3493 return 0;
3494
3495 case DB_FOUND_LATER:
3496 continue; /* skips the total fix-up below; NOTE(review): meaning of LATER is set by db_lookup(), confirm */
3497
3498 case DB_FOUND_IT:
3499 /* We found the checksum
3500 * Update the hash table to point to the new record */
3501 DB_HPTR_CP(db_sts.hash.d.h->rcd, db_sts.rcd.s.rptr);
3502 SET_FLUSH_HE(&db_sts.hash);
3503 /* link new record to existing record */
3504 rcd_ck->prev = DB_PTR_CP(db_sts.rcd2.s.rptr);
3505
3506 /* delete predecessors to a delete request
3507 * and compute the remaining sum */
3508 if (rcd_tgts == DCC_TGTS_DEL) {
3509 if (!del_ck(emsg, &res, rcd, rcd_type,
3510 prev_ck, &db_sts.rcd2))
3511 return 0;
3512 /* delete requests are obsolete if the
3513 * checksum is whitelisted */
3514 if (res == DCC_TGTS_OK
3515 || res == DCC_TGTS_OK2)
3516 rcd_ck->type_fgs |= DB_CK_FG_OBS;
3517 break;
3518 }
3519
3520 /* Simple checksum with a predecessor
3521 * This does not do the substantial extra work
3522 * to notice all delete requests that arrived early.
3523 * That problem is handled by the incoming flood
3524 * duplicate report detection mechanism.
3525 * We must detect predecessors that were deleted because
3526 * they are partial duplicates of the new record. */
3527 prev_ck_tgts = DB_TGTS_CK(prev_ck);
3528 if (DB_RCD_SUMRY(rcd))
3529 res = prev_ck_tgts; /* a summary record takes the predecessor total as is */
3530 else
3531 res = db_sum_ck(prev_ck_tgts, res, rcd_type); /* otherwise combine it with the new count */
3532 if ((res == DCC_TGTS_OK || res == DCC_TGTS_OK2
3533 || (DB_RCD_ID(db_sts.rcd2.d.r) == DCC_ID_WHITE))
3534 && DB_RCD_ID(rcd) != DCC_ID_WHITE){
3535 /* obsolete whitelisted checksums */
3536 rcd_ck->type_fgs |= DB_CK_FG_OBS;
3537 break;
3538 }
3539 if (res == DCC_TGTS_TOO_MANY) {
3540 /* mark obsolete unneeded reports of spam */
3541 if (!DB_CK_OBS(rcd_ck)
3542 && !ck_obs_spam(emsg, rcd, rcd_tgts,
3543 rcd_ck, rcd_type,
3544 prev_ck, prev_ck_tgts,
3545 &keeping_new))
3546 return 0; /* (broken database) */
3547 } else if (rcd_type == DCC_CK_SRVR_ID) {
3548 /* mark obsolete server-ID assertions */
3549 if (!DB_CK_OBS(rcd_ck)
3550 && !srvr_id_ck(emsg, rcd, rcd_ck, prev_ck))
3551 return 0; /* (broken database) */
3552 }
3553 break;
3554
3555 case DB_FOUND_EMPTY:
3556 /* We found an empty hash table slot.
3557 * Update the slot to point to our new record
3558 * after removing it from the free list,
3559 * which marks it dirty. */
3560 if (!unlink_free_hash(emsg, &db_sts.hash))
3561 return 0;
3562 DB_HPTR_CP(db_sts.hash.d.h->rcd, db_sts.rcd.s.rptr);
3563 HE_MERGE(db_sts.hash.d.h,rcd_type, rcd_ck->sum);
3564 if (res >= BULK_THRESHOLD)
3565 keeping_new = 1;
3566 break;
3567
3568 case DB_FOUND_CHAIN:
3569 /* We found a hash collision, a chain of 1 or more
3570 * records with the same hash value.
3571 * Get a free slot, link it to the end of the
3572 * existing chain, and point it to the new record.
3573 * The buffer containing the free slot is marked
3574 * dirty when it is removed from the free list. */
3575 if (!get_free_hash(emsg, db_sts.hash.s.haddr))
3576 return 0;
3577 DB_HADDR_CP(db_sts.free.d.h->bak, db_sts.hash.s.haddr); /* new slot points back at the found entry */
3578 DB_HADDR_CP(db_sts.hash.d.h->fwd, db_sts.free.s.haddr); /* found entry points forward at the new slot */
3579 DB_HPTR_CP(db_sts.free.d.h->rcd, db_sts.rcd.s.rptr);
3580 HE_MERGE(db_sts.free.d.h,rcd_type, rcd_ck->sum);
3581 SET_FLUSH_HE(&db_sts.hash);
3582 if (res >= BULK_THRESHOLD)
3583 keeping_new = 1;
3584 break;
3585
3586 case DB_FOUND_INTRUDER:
3587 /* The home hash slot for our key contains an
3588 * intruder. Move it to a new free slot */
3589 if (!get_free_hash(emsg, db_sts.hash.s.haddr))
3590 return 0;
3591 *db_sts.free.d.h = *db_sts.hash.d.h; /* copy the whole intruder entry into the free slot */
3592 /* re-link the neighbors of the intruder */
3593 haddr = DB_HADDR_EX(db_sts.free.d.h->bak);
3594 if (haddr == DB_HADDR_NULL) { /* an entry without a back link is treated as corruption */
3595 dcc_pemsg(EX_DATAERR, emsg,
3596 "bad hash chain reverse link at %#x"
3597 " in %s",
3598 haddr, db_hash_nm);
3599 return 0;
3600 }
3601 if (!map_hash(emsg, haddr, &db_sts.tmp, 0))
3602 return 0;
3603 DB_HADDR_CP(db_sts.tmp.d.h->fwd, db_sts.free.s.haddr);
3604 SET_FLUSH_HE(&db_sts.tmp);
3605 haddr = DB_HADDR_EX(db_sts.hash.d.h->fwd);
3606 if (haddr != DB_HADDR_NULL) { /* a forward neighbor is optional */
3607 if (!map_hash(emsg, haddr, &db_sts.tmp, 0))
3608 return 0;
3609 DB_HADDR_CP(db_sts.tmp.d.h->bak,
3610 db_sts.free.s.haddr);
3611 SET_FLUSH_HE(&db_sts.tmp);
3612 }
3613 /* install the new entry in its home slot */
3614 DB_HADDR_CP(db_sts.hash.d.h->fwd, DB_HADDR_NULL);
3615 DB_HADDR_CP(db_sts.hash.d.h->bak, DB_HADDR_NULL);
3616 DB_HPTR_CP(db_sts.hash.d.h->rcd, db_sts.rcd.s.rptr);
3617 HE_MERGE(db_sts.hash.d.h,rcd_type, rcd_ck->sum);
3618 SET_FLUSH_HE(&db_sts.hash);
3619 if (res >= BULK_THRESHOLD)
3620 keeping_new = 1;
3621 break;
3622 }
3623
3624 /* Fix the checksum's total in the record */
3625 DB_TGTS_CK_SET(rcd_ck, res);
3626 SET_FLUSH_RCD(&db_sts.rcd, 0);
3627 }
3628
3629 return db_set_sizes(emsg); /* reached only when every checksum was handled without error */
3630 }
3631
3632
3633
3634 /* Add a record at the current end of the database (db_csize) and to the hash table.
3635 * The record must be known to be valid
3636 * Use db_sts.rcd, db_sts.hash, db_sts.rcd2, db_sts.free, db_sts.tmp
3637 * On exit db_sts.rcd points to the new record in the database */
3638 DB_PTR /* 0=failed, otherwise the position of the new record */
3639 db_add_rcd(DCC_EMSG emsg, const DB_RCD *new_rcd)
3640 {
3641 u_int new_rcd_len, pad_len;
3642 DB_PTR new_db_csize, rcd_pos, new_page_num;
3643 DB_BUF *b;
3644
3645 if (!db_make_dirty(emsg))
3646 return 0;
3647
3648 new_rcd_len = (sizeof(*new_rcd) /* size of the record with only the checksums it uses */
3649 - sizeof(new_rcd->cks)
3650 + (DB_NUM_CKS(new_rcd) * sizeof(new_rcd->cks[0])));
3651
3652 rcd_pos = db_csize; /* append at the current end of the data */
3653 new_db_csize = rcd_pos+new_rcd_len;
3654
3655 new_page_num = DB_PTR2PG_NUM(new_db_csize, db_pagesize);
3656 if (new_page_num == DB_PTR2PG_NUM(db_csize, db_pagesize)) { /* the new end is on the same page as the old end */
3657 if (!map_db(emsg, rcd_pos, new_rcd_len, &db_sts.rcd, 0))
3658 return 0;
3659
3660 } else {
3661 /* fill with zeros to get past a page boundary. */
3662 pad_len = new_page_num*db_pagesize - db_csize; /* bytes up to the start of the page holding the new end */
3663 pad_len = (((pad_len + DB_RCD_HDR_LEN-1) / DB_RCD_HDR_LEN) /* round up to a multiple of DB_RCD_HDR_LEN */
3664 * DB_RCD_HDR_LEN);
3665 if (pad_len != 0) {
3666 if (!map_db(emsg, db_csize, pad_len, &db_sts.rcd, 0))
3667 return 0;
3668 memset(db_sts.rcd.d.r, 0, pad_len);
3669 db_set_flush(&db_sts.rcd, 1, pad_len);
3670 db_csize += pad_len;
3671
3672 rcd_pos = db_csize; /* the record now starts after the padding */
3673 new_db_csize = rcd_pos+new_rcd_len;
3674 }
3675
3676 /* extend the file by writing a full page to it with write(),
3677 * because extending by mmap() often does not work */
3678 db_fsize = db_csize+db_pagesize;
3679 if (!map_db(emsg, rcd_pos, db_pagesize, &db_sts.rcd, 1)) /* NOTE(review): the final 1 presumably requests the extension; confirm in map_db() */
3680 return 0;
3681 b = db_sts.rcd.b;
3682 b->flush = (DB_BUF_FM)-1; /* all bits set; NOTE(review): presumably marks the whole buffer for flushing, confirm */
3683
3684 /* push new page to disk if dblist or dbclean is running */
3685 if (db_minimum_map) {
3686 rel_db_state(&db_sts.rcd);
3687 if (!buf_munmap(emsg, b))
3688 return 0;
3689 if (!map_db(emsg, rcd_pos, new_rcd_len, &db_sts.rcd, 0)) /* map again, now only the record itself */
3690 return 0;
3691 }
3692 }
3693
3694 /* install the record */
3695 memcpy(db_sts.rcd.d.r, new_rcd, new_rcd_len);
3696 /* Mark its buffer to be sent to the disk to keep the database
3697 * as good as possible even if we crash. We don't need to worry
3698 * about later changes to the hash links because dbclean will
3699 * rebuild them if we crash */
3700 db_set_flush(&db_sts.rcd, 1, new_rcd_len);
3701 db_csize = new_db_csize; /* the data now ends after the new record */
3702
3703 /* install pointers in the hash table
3704 * and update the total counts in the record */
3705 if (!db_link_rcd(emsg, 0, MAX_HASH_ENTRIES))
3706 return 0;
3707
3708 ++db_stats.adds; /* counted only after the record is installed and linked */
3709 return rcd_pos;
3710 }