Mercurial > notdcc
diff srvrlib/db.c @ 0:c7f6b056b673
First import of vendor version
| author | Peter Gervai <grin@grin.hu> |
|---|---|
| date | Tue, 10 Mar 2009 13:49:58 +0100 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/srvrlib/db.c Tue Mar 10 13:49:58 2009 +0100 @@ -0,0 +1,3710 @@ +/* Distributed Checksum Clearinghouse + * + * server database functions + * + * Copyright (c) 2008 by Rhyolite Software, LLC + * + * This agreement is not applicable to any entity which sells anti-spam + * solutions to others or provides an anti-spam solution as part of a + * security solution sold to other entities, or to a private network + * which employs the DCC or uses data provided by operation of the DCC + * but does not provide corresponding data to other users. + * + * Permission to use, copy, modify, and distribute this software without + * changes for any purpose with or without fee is hereby granted, provided + * that the above copyright notice and this permission notice appear in all + * copies and any distributed versions or copies are either unchanged + * or not called anything similar to "DCC" or "Distributed Checksum + * Clearinghouse". + * + * Parties not eligible to receive a license under this agreement can + * obtain a commercial license to use DCC by contacting Rhyolite Software + * at sales@rhyolite.com. + * + * A commercial license would be for Distributed Checksum and Reputation + * Clearinghouse software. That software includes additional features. This + * free license for Distributed ChecksumClearinghouse Software does not in any + * way grant permision to use Distributed Checksum and Reputation Clearinghouse + * software + * + * THE SOFTWARE IS PROVIDED "AS IS" AND RHYOLITE SOFTWARE, LLC DISCLAIMS ALL + * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL RHYOLITE SOFTWARE, LLC + * BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES + * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, + * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Rhyolite Software DCC 1.3.103-1.214 $Revision$ + */ + +#include "srvr_defs.h" +#include <syslog.h> +#include <sys/resource.h> +#if defined(HAVE_HW_PHYSMEM) || defined(HAVE_BOOTTIME) +#include <sys/sysctl.h> +#endif +#ifdef HAVE_PSTAT_GETSTATIC /* HP-UX */ +#include <sys/pstat.h> +#endif + +DB_STATS db_stats; + +DB_STATES db_sts; + +DCC_PATH db_path_buf; + +int db_fd = -1; +DCC_PATH db_nm; +int db_hash_fd = -1; +DCC_PATH db_hash_nm; +struct timeval db_locked; /* 1=database not locked */ + +struct timeval db_time; + +int db_debug; + +u_char grey_on; +static u_char db_use_write; /* 0=no 1=if RAM big enough 2=always */ +static u_char db_dirty; +static u_char db_rdonly; +int db_failed_line; /* bad happened at this line # */ +const char *db_failed_file; /* in this file */ +static u_char db_invalidate; /* do not write to the files */ + +/* Without mmap(MAP_NOSYNC) as on Solaris or a good msync() as on BSD/OS, + * we must rely on the kernel's update/syncer/bufdaemon/etc. So in this + * case just fondle the mmap()'ed pages and hope things work out. + * + * With a msync() and with mmap(MAP_NOSYNC), use MAP_NOSYNC if we can because + * some systems flush too quickly while others such as FreeBSD 6.1 stall + * for seconds while thinking about flushing the database. + * But with mmap(MAP_NOSYNC) we leave large amounts of data in RAM that take + * too long time to be pushed to the disk when the system is shutting down. + * So + * - hit only those chunks of memory with real data or changes to data + * with msync(). Trust dbclean to rebuild everything else at need. 
+ * + * - when it seems the system is being shut down, delete the hash table + * and let it be rebuilt when the system is rebooted. When the + * hash table is rebuilt, "obsolete" markings in the data file that + * might have been lost will be remade. + * + * A third case involves dccd -F. It requires that all changes be pushed to + * the disk whenever dccd unlocks the database so that dbclean can see changes + * dccd makes. It also requires that dbclean write all of its changes so + * that dccd will find them when it reopens the database. + */ + +#if !defined(MAP_NOSYNC) || defined(HAVE_OLD_MSYNC) || !defined(HAVE_BOOTTIME) +#undef USE_MAP_NOSYNC +#else +#define USE_MAP_NOSYNC +#endif + +static u_char db_not_synced; /* database unsynchronized with disk */ + + +#define DCC_MADV_WILLNEED(p) 0 +#ifdef MADV_WILLNEED +#undef DCC_MADV_WILLNEED +#define DCC_MADV_WILLNEED(p) madvise(p, db_pagesize, MADV_WILLNEED) +#endif +#ifdef POSIX_MADV_WILLNEED +#undef DCC_MADV_WILLNEED +#define DCC_MADV_WILLNEED(p) posix_madvise(p, db_pagesize, POSIX_MADV_WILLNEED) +#endif + +#define DCC_MADV_RANDOM(p) 0 +#ifdef MADV_RANDOM +#undef DCC_MADV_RANDOM +#define DCC_MADV_RANDOM(p) madvise(p, db_pagesize, MADV_RANDOM) +#endif +#ifdef POSIX_MADV_RANDOM +#undef DCC_MADV_RANDOM +#define DCC_MADV_RANDOM(p) posix_madvise(p, db_pagesize, POSIX_MADV_RANDOM) +#endif + +#define DCC_MADV_DONTNEED(p) 0 +/* The Linux people claim that it is just fine that their notion of + * MADV_DONTNEED implies discarding changes to data. Worse, some versions of + * Linux/GNU libc define POSIX_MADV_DONTNEED as the data-corrupting Linux + * MADV_DONTNEED. 
This seems to be because they cannot admit their mistake of + * not distinguishing between the functions of MADV_FREE and MADV_DONTNEED and + * their misreading of other systems' documentation for MADV_DONTNEED */ +#ifndef linux +#ifdef MADV_DONTNEED +#undef DCC_MADV_DONTNEED +#define DCC_MADV_DONTNEED(p) madvise(p, db_pagesize, MADV_DONTNEED) +#endif +#ifdef POSIX_MADV_DONTNEED +#undef DCC_MADV_DONTNEED +#define DCC_MADV_DONTNEED(p) posix_madvise(p, db_pagesize, POSIX_MADV_DONTNEED) +#endif +#endif /* !linux */ + +#define DCC_MADV_FREE(p) 0 +#ifdef MADV_FREE +#undef DCC_MADV_FREE +#define DCC_MADV_FREE(p) madvise(p, db_pagesize, MADV_FREE) +#endif +#ifdef POSIX_MADV_FREE +#undef DCC_MADV_FREE +#define DCC_MADV_FREE(p) posix_madvise(p, db_pagesize, POSIX_MADV_FREE) +#endif + + +u_char db_minimum_map; /* this is dccd & dbclean is running */ + +int db_buf_total; /* total # of db buffers */ +DB_PTR db_max_rss; /* maximum db resident set size */ +DB_PTR db_max_byte; /* maximum db bytes in both files */ + +static u_int system_pagesize; /* kernel page size */ + +static DB_BUF db_bufs[DB_BUF_MAX]; /* control mmap()'ed blocks */ +static DB_BUF *buf_oldest, *buf_newest; + +#define DB_HASH_TOTAL DB_BUF_MAX +static DB_BUF *db_buf_hash[DB_HASH_TOTAL]; +/* fancy 16-bit multiplicative hash assumes multiplication needs 1 cycle + * and so the hash is faster than dealing with a collision */ +#define DB_BUF_HASH(pnum,t) (&db_buf_hash[((((pnum)*(t)*0x9ccf) & 0xffff) \ + * DB_BUF_MAX) >> 16]) + +time_t db_need_flush_secs; +static time_t db_urgent_need_flush_secs; + +const DB_VERSION_BUF db_version_buf = DB_VERSION_STR; +DB_PARMS db_parms; +static DB_PARMS db_parms_stored; + +DCC_TGTS db_tholds[DCC_DIM_CKS]; + +u_int db_pagesize; /* size of 1 mmap()'ed buffer */ +static u_int db_pagesize_part; + +DB_HOFF db_hash_fsize; /* size of hash table file */ +static u_int hash_clear_pg_num; +DB_HADDR db_hash_len; /* # of hash table entries */ +DB_HADDR db_hash_divisor; /* modulus */ +DB_HADDR 
db_hash_used; /* # of hash table entries in use */ +u_int db_hash_page_len; /* # of HASH_ENTRY's per buffer */ +DB_HADDR db_max_hash_entries = 0; /* after db_buf_init()*/ +DB_PTR db_fsize; /* size of database file */ +DB_PTR db_csize; /* size of database contents in bytes */ +static DB_PTR db_csize_stored_hash; /* DB size stored in hash file */ +static DB_HADDR db_hash_used_stored_hash; +u_int db_page_max; /* only padding after this in DB buf */ +static DB_PTR db_window_size; /* size of mmap() window */ +char db_window_size_str[128]; +static char db_physmem_str[80]; + +static const u_char dcc_ck_fuzziness[DCC_DIM_CKS] = { + 0, /* DCC_CK_INVALID */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_IP */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_ENV_FROM */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_FROM */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_SUB */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_MESSAGE_ID */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_RECEIVED */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_BODY */ + DCC_CK_FUZ_LVL1, /* DCC_CK_FUZ1 */ + DCC_CK_FUZ_LVL2, /* DCC_CK_FUZ2 */ + DCC_CK_FUZ_LVL_REP, /* DCC_CK_REP_TOTAL */ + DCC_CK_FUZ_LVL_REP, /* DCC_CK_REP_BULK */ + DCC_CK_FUZ_LVL2, /* DCC_CK_SRVR_ID */ + DCC_CK_FUZ_LVL2 /* DCC_CK_ENV_TO */ +}; +static const u_char grey_ck_fuzziness[DCC_DIM_CKS] = { + 0, /* DCC_CK_INVALID */ + DCC_CK_FUZ_LVL2, /* DCC_CK_IP */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_ENV_FROM */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_FROM */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_SUB */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_MESSAGE_ID */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_RECEIVED */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_BODY */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_FUZ1 */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_FUZ2 */ + DCC_CK_FUZ_LVL_NO, /* DCC_CK_GREY_MSG */ + DCC_CK_FUZ_LVL1, /* DCC_CK_GREY_TRIPLE */ + DCC_CK_FUZ_LVL1, /* DCC_CK_SRVR_ID */ + DCC_CK_FUZ_LVL1 /* DCC_CK_ENV_TO */ +}; +const u_char *db_ck_fuzziness = dcc_ck_fuzziness; + + +static u_char buf_flush(DCC_EMSG, DB_BUF *, u_char); +static u_char buf_munmap(DCC_EMSG, DB_BUF *); +static DB_BUF *find_buf(DCC_EMSG, DB_BUF_TYPE, 
DB_PG_NUM); +static u_char map_hash(DCC_EMSG, DB_HADDR, DB_STATE *, u_char); +static u_char map_hash_ctl(DCC_EMSG, u_char); +static u_char map_db(DCC_EMSG, DB_PTR, u_int, DB_STATE *, u_char); +static u_char db_set_sizes(DCC_EMSG); + + +/* compute the least common multiple of two numbers */ +static u_int +lcm(u_int n, u_int m) +{ + u_int r, x, gcd; + + /* first get the gcd of the two numbers */ + if (n >= m) { + x = n; + gcd = m; + } else { + x = m; + gcd = n; + } + for (;;) { + r = x % gcd; + if (r == 0) + return n * (m / gcd); + x = gcd; + gcd = r; + } +} + + + +const char * +db_ptr2str(DB_PTR val) +{ + static int bufno; + static struct { + char str[16]; + } bufs[4]; + char *s; + const char *units; + + if (val == 0) + return "0"; + + s = bufs[bufno].str; + bufno = (bufno+1) % DIM(bufs); + + if (val % (1024*1024*1024) == 0) { + val /= (1024*1024*1024); + units = "GB"; + } else if (val % (1024*1024) == 0) { + val /= (1024*1024); + units = "MB"; + } else if (val % 1024 == 0) { + val /= 1024; + units = "KB"; + } else { + units = ""; + } + if (val > 1000*1000*1000) + snprintf(s, sizeof(bufs[0].str), "%d,%03d,%03d,%03d%s", + (int)(val / (1000*1000*1000)), + (int)(val / (1000*1000)) % 1000, + (int)(val / 1000) % 1000, + (int)(val % 1000), + units); + else if (val > 1000*1000) + snprintf(s, sizeof(bufs[0].str), "%d,%03d,%03d%s", + (int)(val / (1000*1000)), + (int)(val / 1000) % 1000, + (int)(val % 1000), + units); + else if (val > 1000*10) + snprintf(s, sizeof(bufs[0].str), "%d,%03d%s", + (int)(val / 1000), + (int)(val % 1000), + units); + else + snprintf(s, sizeof(bufs[0].str), "%d%s", + (int)val, + units); + return s; +} + + + +const char * +size2str(char *buf, u_int buf_len, + double num, u_char bytes_or_entries) /* 0=number 1=bytes */ +{ + const char *units; + double k; + + k = bytes_or_entries ? 
1024.0 : 1000.0; + + if (num < k) { + units = ""; + } else if (num < k*k) { + num /= k; + units = "K"; + } else if (num < k*k*k) { + num /= k*k; + units = "M"; + } else { + num /= k*k*k; + units = "G"; + } + + if ((int)num >= 100) + snprintf(buf, buf_len, "%.0f%s", num, units); + else + snprintf(buf, buf_len, "%.2g%s", num, units); + return buf; +} + + + +void PATTRIB(5,6) +db_failure(int linenum, const char *file, int ex_code, DCC_EMSG emsg, + const char *p, ...) +{ + va_list args; + + if (!db_failed_line) { + db_failed_line = linenum; + db_failed_file = file; + } + va_start(args, p); + dcc_vpemsg(ex_code, emsg, p, args); + va_end(args); +} + + + +void PATTRIB(3,4) +db_error_msg(int linenum, const char *file, const char *p, ...) +{ + va_list args; + + if (!db_failed_line) { + db_failed_line = linenum; + db_failed_file = file; + } + va_start(args, p); + dcc_verror_msg(p, args); + va_end(args); +} + + + +double /* hashes or bytes/second */ +db_add_rate(const DB_PARMS *parms, + u_char hash_or_db) /* 1=hash */ +{ + struct timeval sn; + time_t new_rate_secs; + time_t total_secs; + double added, cur, prev; + + total_secs = parms->rate_secs; + if (hash_or_db) { + added = parms->hash_added; + cur = parms->hash_used; + prev = parms->old_hash_used; + } else { + added = parms->db_added; + cur = parms->db_csize; + prev = parms->old_db_csize; + } + + if (total_secs <= 0 || total_secs > DB_MAX_RATE_SECS + || added <= 0.0) { + added = 0.0; + total_secs = 0; + } + + dcc_ts2timeval(&sn, &parms->sn); + new_rate_secs = parms->last_rate_sec - sn.tv_sec; + if (new_rate_secs > 0 && new_rate_secs <= DB_MAX_RATE_SECS + && cur > prev) { + total_secs += new_rate_secs; + added += cur - prev; + } + + if (total_secs <= DB_MIN_RATE_SECS) + return -1.0; + return added / total_secs; +} + + + +DB_NOKEEP_CKS +def_nokeep_cks(void) +{ + DCC_CK_TYPES type; + DB_NOKEEP_CKS nokeep = 0; + + for (type = DCC_CK_TYPE_FIRST; type <= DCC_CK_TYPE_LAST; ++type) { + if (DB_GLOBAL_NOKEEP(grey_on, type)) + 
DB_SET_NOKEEP(nokeep, type); + } + DB_SET_NOKEEP(nokeep, DCC_CK_INVALID); + DB_SET_NOKEEP(nokeep, DCC_CK_FLOD_PATH); + + return nokeep; +} + + + +void +set_db_tholds(DB_NOKEEP_CKS nokeep) +{ + DCC_CK_TYPES type; + + for (type = 0; type < DIM(db_tholds); ++type) { + db_tholds[type] = (DB_TEST_NOKEEP(nokeep, type) + ? DCC_TGTS_INVALID + : DCC_CK_IS_REP_CMN(grey_on, type) + ? DCC_TGTS_INVALID + : grey_on ? 1 + : type == DCC_CK_SRVR_ID ? 1 + : BULK_THRESHOLD); + } +} + + + +static const char * +buf2path(const DB_BUF *b) +{ + switch (b->buf_type) { + case DB_BUF_TYPE_HASH: + return db_hash_nm; + case DB_BUF_TYPE_DB: + return db_nm; + case DB_BUF_TYPE_FREE: + default: + dcc_logbad(EX_SOFTWARE, "impossible buffer type for a path"); + } +} + + + +static int +buf2fd(const DB_BUF *b) +{ + switch (b->buf_type) { + case DB_BUF_TYPE_HASH: + return db_hash_fd; + case DB_BUF_TYPE_DB: + return db_fd; + case DB_BUF_TYPE_FREE: + default: + dcc_logbad(EX_SOFTWARE, "impossible buffer type for fd"); + } +} + + + +static void +rel_db_state(DB_STATE *st) +{ + DB_BUF *b; + + b = st->b; + if (!b) + return; + st->b = 0; + st->d.v = 0; + st->s.rptr = DB_PTR_BAD; + if (--b->lock_cnt < 0) + dcc_logbad(EX_SOFTWARE,"negative database buffer lock"); +} + + + +void +rel_db_states(void) +{ + DB_STATE *st; + + for (st = &db_sts.rcd; st <= &db_sts.hash_ctl; ++st) { + rel_db_state(st); + } +} + + + +/* release one or all unneeded buffers */ +u_char /* 0=problem 1=did nothing 2=did>=1 */ +db_unload(DCC_EMSG emsg, + u_char some) /* 0=all, 1=only one, 2=finished */ +{ + DB_BUF *b; + u_char result; + + result = 1; + for (b = buf_oldest; b != 0; b = b->newer) { + if (b->buf_type == DB_BUF_TYPE_FREE + || b->lock_cnt != 0) + continue; + if (some == 2 + && !(b->flags & DB_BUF_FG_USE_WRITE) + && 0 > DCC_MADV_DONTNEED(b->buf.v)) + dcc_error_msg("madvise(DONTNEED %s,%#x): %s", + buf2path(b), db_pagesize, ERROR_STR()); + if (!buf_munmap(emsg, b)) { + emsg = 0; + result = 0; + } else if (result) { + result = 2; + 
} + if (some == 1) + return result; + } + + return result; +} + + + +static u_char +buf_write_part(DCC_EMSG emsg, DB_BUF *b, off_t offset, void *buf, int len) +{ + int i; + + offset += (off_t)b->pg_num * (off_t)db_pagesize; + + if (offset != lseek(buf2fd(b), offset, SEEK_SET)) { + db_failure(__LINE__,__FILE__, EX_IOERR, emsg, + "buf_write_part lseek(%s,"OFF_HPAT"): %s", + buf2path(b), offset, ERROR_STR()); + return 0; + } + i = write(buf2fd(b), buf, len); + if (i != len) { + db_failure(__LINE__,__FILE__, EX_IOERR, emsg, + "buf_write_part(%s,%u)=%d: %s", + buf2path(b), len, i, ERROR_STR()); + return 0; + } + + return 1; +} + + + +/* push part of a buffer toward the disk + * this can be needed even when the file has been opened and mapped + * read-only by dbclean */ +static u_char +buf_flush_part(DCC_EMSG emsg, DB_BUF *b, + u_int part, /* DB_BUF_NUM_PARTS=buffer */ + u_char async UATTRIB) +{ + u_int flush_len; + char *flush_base; + DB_BUF_FM bit; + + bit = PART2BIT(part) & (b->flush | b->flush_urgent); + if (!bit) + return 1; + + /* Send a new buffer to disk at once. */ + if (b->flags & DB_BUF_FG_EXTENSION) { + DB_BUF *b1, *b0; + u_char result; + + /* To give the file system a chance to make the hash table + * contiguous, first write all preceding new buffers. + * In almost all cases, there will be none. 
*/ + result = 1; + do { + b0 = b; + for (b1 = buf_oldest; b1 != 0; b1 = b1->newer) { + if (!(b1->flags & DB_BUF_FG_EXTENSION) + || b1->buf_type != b0->buf_type + || b1->pg_num >= b0->pg_num) + continue; + b0 = b1; + } + b0->flags &= ~DB_BUF_FG_EXTENSION; + b0->flush = 0; + b0->flush_urgent = 0; + if (!db_invalidate + && !buf_write_part(emsg, b0, + 0, b0->buf.c, db_pagesize)) + result = 0; + } while (b0 != b); + return result; + } + + flush_base = b->ranges[part].lo; + flush_len = b->ranges[part].hi - flush_base; + b->flush &= ~bit; + b->flush_urgent &= ~bit; + + if (db_invalidate) + return 1; + + if (b->flags & DB_BUF_FG_USE_WRITE) { + static char *wbuf; + static u_int wbuf_len; + + /* In at least FreeBSD you cannot write() to the file + * that underlies a mmap() region from that region */ + if (wbuf_len < db_pagesize_part) { + /* the page size for the current file + * might be different from the old file */ + if (wbuf) + free(wbuf); + wbuf_len = db_pagesize_part; + wbuf = malloc(wbuf_len); + } + + memcpy(wbuf, flush_base, flush_len); + return buf_write_part(emsg, b, flush_base - b->buf.c, + wbuf, flush_len); + +#ifndef HAVE_OLD_MSYNC + } else if (async) { + if (0 > MSYNC(flush_base, flush_len, MS_ASYNC)) { + db_failure(__LINE__,__FILE__, EX_IOERR, emsg, + "msync(db buffer %s,%#lx,%#x,MS_ASYNC): %s", + buf2path(b), (long)flush_base, flush_len, + ERROR_STR()); + return 0; + } +#endif + } else { + if (0 > MSYNC(flush_base, flush_len, MS_SYNC)) { + db_failure(__LINE__,__FILE__, EX_IOERR, emsg, + "msync(db buffer %s,%#lx,%#x,MS_SYNC): %s", + buf2path(b), (long)flush_base, flush_len, + ERROR_STR()); + return 0; + } + } + + return 1; +} + + + +static u_char +buf_flush(DCC_EMSG emsg, DB_BUF *b, u_char async) +{ + u_int part; + DB_BUF_FM bits; + u_char result = 1; + + bits = b->flush_urgent | b->flush; + for (part = 0; bits != 0 && part < DB_BUF_NUM_PARTS; ++part) { + if (bits & PART2BIT(part)) { + if (!buf_flush_part(emsg, b, part, async)) { + emsg = 0; + result = 0; + } 
+ bits = b->flush_urgent | b->flush; + } + } + return result; +} + + + +/* Try to keep the data clean so that the fsync() required by Solaris + * when the file is unloaded is not too expensive. + * Try to flush frequently so that we don't stall as long in msync(). + */ +void +db_flush_needed(void) +{ + static DB_BUF *next_b = db_bufs; + static u_int next_part; + DB_BUF *b; + u_int part, all_parts; + int buf_num; + u_char worked; + + /* send to the disk changes that cannot be recreated by dbclean */ + if (db_urgent_need_flush_secs != 0 + && DB_IS_TIME(db_urgent_need_flush_secs, + DB_URGENT_NEED_FLUSH_SECS)) { + worked = 0; + for (b = buf_newest; b; b = b->older) { + if (b->buf_type == DB_BUF_TYPE_FREE) + continue; + + for (part = 0; + b->flush_urgent != 0 && part < DB_BUF_NUM_PARTS; + ++part) { + if ((b->flush_urgent & PART2BIT(part))) { + buf_flush_part(0, b, part, 1); + worked = 1; + } + } + + /* Switch new data pages to mmap() + * when this is not dbclean, since only dccd calls here + * they are not using mmap() + * they are either hash table pages or + * not the last page in the file */ + if ((b->flags & DB_BUF_FG_USE_WRITE) + && !db_use_write + && (b->buf_type != DB_BUF_TYPE_DB + || (DB_PTR2PG_NUM(db_csize-1, db_pagesize) + != b->pg_num))) { + if (b->lock_cnt != 0) + rel_db_states(); + buf_munmap(0, b); + } + } + + /* Keep the clock running if we did any work. This tends to + * avoid stalls caused by colliding with the FreeBSD syncer */ + if (worked) { + gettimeofday(&db_time, 0); + db_urgent_need_flush_secs = (db_time.tv_sec + + DB_URGENT_NEED_FLUSH_SECS); + } else { + db_urgent_need_flush_secs = 0; + } + } + + /* assume there will be nothing more to do */ + db_need_flush_secs = db_urgent_need_flush_secs; + +#ifdef USE_MAP_NOSYNC + /* if we are using mmap(MAP_NOSYNC), then there are no bits + * set in any b->flush words except that of the recent + * DB_BUF_FG_USE_WRITE extensions of the file. 
It is best to let + * those blocks stay in RAM until the whole buffer is flushed and + * switched to mmap above */ + if (!db_use_write) + return; +#endif + + b = next_b; + part = next_part; + all_parts = DB_PARTS_PER_FLUSH; + for (buf_num = DIM(db_bufs); buf_num >= 0; --buf_num) { + if (b > LAST(db_bufs)) { + part = 0; + b = db_bufs; + } + if (!b->flush + || part >= DB_BUF_NUM_PARTS + || b->buf_type == DB_BUF_TYPE_FREE) { + part = 0; + ++b; + continue; + } + + while (part < DB_BUF_NUM_PARTS) { + if (b->flush & PART2BIT(part)) { + buf_flush_part(0, b, part, 1); + if (--all_parts == 0) { + next_part = part+1; + next_b = b; + db_need_flush_secs = (db_time.tv_sec + + DB_NEED_FLUSH_SECS); + return; + } + if (!b->flush) + part = DB_BUF_NUM_PARTS; + } + ++part; + } + } +} + + + +/* occassionally flush an unlocked data buffer for dbclean + * dbclean mostly changes only the current record, so get started + * writing the data to avoid stalling the system at the end. */ +u_char +db_flush_db(DCC_EMSG emsg UATTRIB) +{ +#ifdef USE_MAP_NOSYNC + DB_BUF *b; + int limit; + int pg_num; + + /* Gently push the new hash table to disk. + * The disk image will never be accurate. This only allocates space. + * Do not do this for systems that lack mmap(NOSYNC) such as Linux + * that thrash themselves as the hash table is being built. A + * long pause when the database is closed is not as bad as spending + * hours building the hash table. 
*/ + while (hash_clear_pg_num < db_hash_fsize/db_hash_page_len) { + pg_num = hash_clear_pg_num++; + for (b = buf_oldest; b != 0; b = b->newer) { + if (b->pg_num != pg_num + || b->buf_type != DB_BUF_TYPE_HASH) + continue; + if (!(b->flags & DB_BUF_FG_EXTENSION)) + break; + if (b->lock_cnt != 0) + rel_db_states(); + return buf_munmap(emsg, b); + } + + /* look for the next page if this one has already + * been flushed */ + } + + /* flush some ordinary buffers */ + limit = 2; + for (b = buf_oldest; b != 0; b = b->newer) { + if (b->flush_urgent == 0 + || b->buf_type == DB_BUF_TYPE_FREE + || b->lock_cnt != 0) + continue; + if (!buf_flush(emsg, b, 1)) + return 0; + if (--limit <= 0) + return 1; + } +#endif + return 1; +} + + + +/* mark part of a buffer dirty + * "Urgent" changes are flushed by a timer. Ordinary changes + * are often ignored and expected to be rebuilt if the system crashes. + * That the hash table is deleted as the system is shut down while the + * database must be flushed from the system's buffer cache is a reason + * to keep the disk image of the database good. */ +void +db_set_flush(DB_STATE *st, u_char urgent, u_int len) +{ + DB_BUF *b; + DB_BUF_FM bit, new_bits, old_bits; + char *buf_base, *part_end, *start, *end; + u_int part, i; + + /* nothing to do if the kernel is handling it + * or if we are letting this change be reconstructed by dbclean */ + b = st->b; + if (!(b->flags & DB_BUF_FG_USE_WRITE)) { +#ifdef USE_MAP_NOSYNC + if (!urgent) +#endif + return; + } + + start = st->d.c; + buf_base = b->buf.c; + + /* Increase to even pages in the hope that the file system might + * be able to page-flip. This might at least avoid reading into the + * buffer cache to honor a write(). Besides, Solaris' msync() handles + * only even pages. 
*/ + i = (start - buf_base) % system_pagesize; + start -= i; + len += i; + len = ((len + system_pagesize-1) / system_pagesize) * system_pagesize; + + end = start + len; + if (end > buf_base+db_pagesize) + dcc_logbad(EX_SOFTWARE, "inflated dirty buffer size"); + + part = (start - buf_base) / db_pagesize_part; + part_end = buf_base + part * db_pagesize_part; + bit = PART2BIT(part); + new_bits = 0; + old_bits = b->flush | b->flush_urgent; + do { + part_end += db_pagesize_part; + if (part_end > end) + part_end = end; + + if (!(old_bits & bit)) { + b->ranges[part].lo = start; + b->ranges[part].hi = part_end; + } else { + if (b->ranges[part].lo > start) + b->ranges[part].lo = start; + if (b->ranges[part].hi < part_end) + b->ranges[part].hi = part_end; + } + new_bits |= bit; + + start = part_end; + bit <<= 1; + ++part; + } while (part_end < end); + + if (urgent) { + b->flush_urgent |= new_bits; + if (!db_urgent_need_flush_secs) { + db_urgent_need_flush_secs = (db_time.tv_sec + + DB_URGENT_NEED_FLUSH_SECS); + if (db_need_flush_secs == 0) + db_need_flush_secs = db_urgent_need_flush_secs; + } + } else { + b->flush |= new_bits; + if (db_need_flush_secs == 0 + || db_need_flush_secs > db_time.tv_sec+DB_NEED_FLUSH_SECS) + db_need_flush_secs = db_time.tv_sec+DB_NEED_FLUSH_SECS; + } +} + + + +/* Shut down the database, including flushing and releasing all + * mmap()'ed buffers + * Do nothing to the files for mode=-1 because the file is new and garbage + * or the caller is a fork of the server shedding memory. */ +u_char +db_close(int mode) /* -1=invalidate, 0=dirty, 1=clean */ +{ + u_char result; + + if (mode >= 0) { + /* flush the data and then release and flush the dirty flags */ + result = make_clean(mode == 0 ? 0 : 1); + if (!db_unload(0, 0)) + result = 0; + } else { + db_invalidate = 1; + rel_db_states(); + result = (db_unload(0, 0) > 0); + } + + /* Close the hash table first because the server is often + * waiting for the lock on the main file held by dbclean. 
+ * Destroy the hash table if it is bad */ + if (db_hash_fd >= 0) { + if (0 > close(db_hash_fd)) { + dcc_pemsg(EX_IOERR, 0, "close(%s): %s", + db_hash_nm, ERROR_STR()); + result = 0; + } + db_hash_fd = -1; + } + if (db_fd >= 0) { + if (0 > close(db_fd)) { + dcc_pemsg(EX_IOERR, 0, "close(%s): %s", + db_nm, ERROR_STR()); + result = 0; + } + db_fd = -1; + } + + db_locked.tv_sec = 0; + return result; +} + + + +/* Delete the hash table if the system is being rebooted and we + * don't trust the file system to get all of the hash table. This might + * make system shut down faster */ +void +db_stop(void) +{ + if (db_hash_fd < 0 + || !DB_IS_LOCKED() + || !db_not_synced + || db_hash_nm[0] == '\0') + return; + + if (0 > unlink(db_hash_nm) + && errno != ENOENT) + dcc_error_msg("unlink(%s): %s", db_hash_nm, ERROR_STR()); +} + + + +/* see if (another) instance of dbclean is already running */ +static int dbclean_lock_fd = -1; +static DCC_PATH dbclean_lock_nm; + +u_char /* 1=no (other) dbclean */ +lock_dbclean(DCC_EMSG emsg, const char *cur_db_nm) +{ + char pid[32]; + int i; + + fnm2rel_good(dbclean_lock_nm, cur_db_nm, DB_LOCK_SUFFIX); + dbclean_lock_fd = dcc_lock_open(emsg, dbclean_lock_nm, + O_RDWR|O_CREAT, + DCC_LOCK_OPEN_NOWAIT, + DCC_LOCK_ALL_FILE, 0); + if (dbclean_lock_fd < 0) + return 0; + + i = 1+snprintf(pid, sizeof(pid), "%ld\n", (long)getpid()); + if (i != write(dbclean_lock_fd, pid, i)) + dcc_logbad(EX_IOERR, "write(%s, pid): %s", + dbclean_lock_nm, ERROR_STR()); + + /* Let anyone write in it in case we are running as root + * and get interrupted by a crash or gdb. 
A stray, stale + * private lock file cannot be locked */ + chmod(dbclean_lock_nm, 0666); + + return 1; +} + + + +void +unlock_dbclean(void) +{ + if (dbclean_lock_fd >= 0) { + if (0 > unlink(dbclean_lock_nm)) + dcc_error_msg("unlink(%s): %s", + dbclean_lock_nm, ERROR_STR()); + close(dbclean_lock_fd); + dbclean_lock_fd = -1; + } +} + + + +/* This locking does only multiple-readers/single-writer */ +int /* -1=failed, 0=was not locked, 1=was */ +db_lock(void) +{ + struct stat sb; + + if (DB_IS_LOCKED()) + return 1; + + if (!dcc_exlock_fd(0, db_fd, DCC_LOCK_ALL_FILE, 15*60, "", db_nm)) + return -1; + if (0 > fstat(db_fd, &sb)) { + db_failure(__LINE__,__FILE__, EX_IOERR, 0, + "stat(%s): %s", db_nm, ERROR_STR()); + return -1; + } + if (db_fsize != (DB_HOFF)sb.st_size) { + if (db_fsize > (DB_HOFF)sb.st_size || !db_rdonly) { + db_failure(__LINE__,__FILE__, EX_IOERR, 0, + "%s size changed from "OFF_HPAT + " to "OFF_HPAT, + db_nm, db_fsize, sb.st_size); + return -1; + } + db_fsize = sb.st_size; + } + + db_locked = db_time; + return 0; +} + + + +/* flush buffers to make the disk reasonably correct but not perfect + * This does not compensate for a lack of coherent mmap() in the system. + * + * It leaves the disk only as accurate as implied by db_not_synced. + * This flushes buffers marked either urgent and ordinarily dirty. + * If db_not_synced is set, then non-urgent dirty bits are not set. 
*/ +static u_char +make_clean_flush(void) +{ + DB_BUF *b; + u_char result; + + result = 1; + for (b = buf_oldest; b != 0; b = b->newer) { + if (b->buf_type == DB_BUF_TYPE_FREE) + continue; + if (!buf_flush(0, b, 0)) + result = 0; + } + + return result; +} + + + +/* push all of our database changes to the disk and try to clear the dirty bit + * do not necessarily unmap anything */ +u_char +make_clean(u_char clean) /* 0=leave hash marked dirty, */ +{ /* 1=marked clean, 2=fsync */ + u_char need_db_fsync, result; + struct stat sb; + + rel_db_states(); + + result = 1; + + /* quit if we are giving up */ + if (db_invalidate) + return result; + + if (db_failed_line) + clean = 0; + + if (!make_clean_flush()) { + clean = 0; + result = 0; + } + + /* simply unlock all of the buffers if they are clean + * and do not need to (or cannot) be synchronized with fsync() */ + if (!db_dirty + && (clean < 2 /* not asked to synchronize */ + || db_rdonly /* cannot be synchronized */ + || !db_not_synced)) /* does not need to be synchronized */ + return result; + + need_db_fsync = (clean == 2); + + /* Send the meta-data to disk so that other processes + * such as dbclean can find the new length of the file + * on Solaris. Otherwise the file looks broken because + * its contained data length can be larger than its + * inode size on Solaris. 
*/ + if (!need_db_fsync && clean) { + if (0 > fstat(db_fd, &sb)) { + dcc_error_msg("make_clean fstat(%s): %s", + db_nm, ERROR_STR()); + need_db_fsync = 1; + } else if (db_fsize != (DB_HOFF)sb.st_size) { + if (db_debug) + quiet_trace_msg("need fsync() because db_fsize=" + OFF_HPAT" but stat="OFF_HPAT, + db_fsize, sb.st_size); + need_db_fsync = 1; + } + } + + if (need_db_fsync + && 0 > fsync(db_fd)) { + dcc_error_msg("make_clean fsync(%s): %s", + db_nm, ERROR_STR()); + clean = 0; + result = 0; + } + + if (clean && !map_hash_ctl(0, 0)) { + clean = 0; + result = 0; + } + if (clean == 2) { + if (0 > fsync(db_hash_fd)) { + dcc_error_msg("make_clean fsync(%s): %s", + db_hash_nm, ERROR_STR()); + clean = 0; + result = 0; + } else { + db_not_synced = 0; + db_sts.hash_ctl.d.vals->s.flags &= ~HASH_CTL_FG_NOSYNC; + SET_FLUSH_HCTL(1); + if (!make_clean_flush()) { + clean = 0; + result = 0; + } + } + } + + /* Clean the dirty flag in the hash table. + * With luck, this will reach the disk after everything else. 
*/ + if (clean + && !(db_sts.hash_ctl.d.vals->s.flags & HASH_CTL_FG_CLEAN)) { + db_sts.hash_ctl.d.vals->s.flags |= HASH_CTL_FG_CLEAN; + SET_FLUSH_HCTL(0); + } + + /* finally flush the flag in the hash table */ + rel_db_states(); + if (!make_clean_flush()) + result = 0; + + if (clean) + db_dirty = 0; + return result; +} + + + +/* mark the hash file and so the database dirty */ +static u_char +db_make_dirty(DCC_EMSG emsg) +{ + if (db_dirty) + return 1; + + if (!DB_IS_LOCKED()) { + dcc_logbad(EX_SOFTWARE, "dirtying unlocked database"); + return 0; + } + + if (db_rdonly) + dcc_logbad(EX_SOFTWARE, "dirtying read-only database"); + + if (!map_hash_ctl(emsg, 0)) + return 0; + db_sts.hash_ctl.d.vals->s.flags &= ~HASH_CTL_FG_CLEAN; +#ifdef USE_MAP_NOSYNC + if (!(db_sts.hash_ctl.d.vals->s.flags & HASH_CTL_FG_NOSYNC)) { + db_sts.hash_ctl.d.vals->s.synced = time(0); + db_sts.hash_ctl.d.vals->s.flags |= HASH_CTL_FG_NOSYNC; + } + db_not_synced = 1; +#endif + + SET_FLUSH_HCTL(1); + if (!buf_flush_part(emsg, db_sts.hash_ctl.b, 0, 0)) + return 0; + + db_dirty = 1; + return 1; +} + + + +/* (start to) unlock the database */ +u_char /* 0=failed, 1=at least started */ +db_unlock(void) +{ + DB_BUF *b; + int result; + + if (!DB_IS_LOCKED()) + return 1; + + /* Clear the dirty bit in the database because we may not + * be able to lock the database later to clear the dirty bit. + * Dbclean needs to see the dirty bit clear. 
*/ + result = make_clean(1); + + /* Release DB_BUF_FG_USE_WRITE buffers because they are not consistent + * among processes + * Release everything if dccd wants stay out of RAM in favor + * of dbclean */ + for (b = buf_oldest; b != 0; b = b->newer) { + if (b->buf_type == DB_BUF_TYPE_FREE) + continue; + if (db_minimum_map + || (b->flags & DB_BUF_FG_USE_WRITE)) + buf_munmap(0, b); + } + + if (!dcc_unlock_fd(0, db_fd, DCC_LOCK_ALL_FILE, "", db_nm)) + result = 0; + db_locked.tv_sec = 0; + return result; +} + + + +static const char * +mbyte2str(DB_PTR val) +{ + return db_ptr2str(val*1024*1024); +} + + + +#if defined(RLIMIT_AS) || defined(RLIMIT_RSS) || defined(RLIMIT_FSIZE) +static DB_PTR +use_rlimit(int resource, const char *rlimit_nm, + DB_PTR cur_val, DB_PTR min_val, const char *val_nm) +{ + struct rlimit limit_old, limit_new; + DB_PTR new_val; + + if (0 > getrlimit(resource, &limit_old)) { + dcc_error_msg("getrlimit(%s): %s", rlimit_nm, ERROR_STR()); + return cur_val; + } + + if ((DB_PTR)limit_old.rlim_cur >= cur_val+DB_PAD_MBYTE*1024) + return cur_val; + + /* assume we are root and try to increase the hard limit */ + if ((DB_PTR)limit_new.rlim_max < cur_val+DB_PAD_BYTE) { + limit_new = limit_old; + limit_new.rlim_max = cur_val+DB_PAD_BYTE; + if (0 > setrlimit(resource, &limit_new)) { + if (db_debug) + quiet_trace_msg("setrlimit(%s, " + L_DPAT","L_DPAT"): %s", + rlimit_nm, + (DB_PTR)limit_new.rlim_cur, + (DB_PTR)limit_new.rlim_max, + ERROR_STR()); + } else { + if (0 > getrlimit(resource, &limit_old)) { + dcc_error_msg("getrlimit(%s): %s", + rlimit_nm, ERROR_STR()); + return cur_val; + } + } + } + + limit_new = limit_old; + if ((DB_PTR)limit_new.rlim_max < min_val+DB_PAD_BYTE) + limit_new.rlim_max = min_val + DB_PAD_BYTE; + limit_new.rlim_cur = limit_new.rlim_max; + if ((DB_PTR)limit_new.rlim_cur > cur_val+DB_PAD_BYTE) + limit_new.rlim_cur = cur_val+DB_PAD_BYTE; + if (0 > setrlimit(resource, &limit_new)) { + dcc_error_msg("setrlimit(%s, "L_DPAT","L_DPAT"): %s", + 
rlimit_nm, + (DB_PTR)limit_new.rlim_cur, + (DB_PTR)limit_new.rlim_max, + ERROR_STR()); + new_val = limit_old.rlim_cur - DB_PAD_BYTE; + if (new_val < min_val) + new_val = min_val; + } else { + if (limit_old.rlim_cur < limit_new.rlim_cur + && db_debug) + quiet_trace_msg("increased %s from %s to %s", + rlimit_nm, + db_ptr2str(limit_old.rlim_cur), +#ifdef RLIM_INFINITY + (limit_new.rlim_cur == RLIM_INFINITY) + ? "infinity" : +#endif + db_ptr2str(limit_new.rlim_cur)); + new_val = limit_new.rlim_cur - DB_PAD_BYTE; + } + + if (cur_val > new_val) { + quiet_trace_msg("%s reduced %s from %s to %s", + rlimit_nm, val_nm, + db_ptr2str(cur_val), + db_ptr2str(new_val)); + return new_val; + } + + return cur_val; +} +#endif + + + +static void +get_db_max_rss(void) +{ + DB_PTR old_val, new_val, db_min_mbyte, db_min_byte, db_max_mbyte; + int physmem_str_len; + DB_PTR physmem; + + /* use default maximum if maximum is bogus or unset by ./configure */ + db_max_mbyte = MAX_MAX_DB_MBYTE; +#if DB_MAX_MBYTE != 0 + db_max_mbyte = DB_MAX_MBYTE; + if (db_max_mbyte < DB_MIN_MIN_MBYTE + || db_max_mbyte > MAX_MAX_DB_MBYTE) { + quiet_trace_msg("ignore bad ./configure --with-max-db-mem=%d", + DB_MAX_MBYTE); + db_max_mbyte = MAX_MAX_DB_MBYTE; + } else if (db_debug) { + quiet_trace_msg("DB max=%s" + " from ./configure --with-max-db-mem=%d", + mbyte2str(db_max_mbyte), DB_MAX_MBYTE); + } +#endif +#ifndef HAVE_BIG_FILES + /* we need big off_t for files larger than 2 GBytes */ + if (db_max_mbyte > DB_MAX_2G_MBYTE) { + old_val = db_max_mbyte; + db_max_mbyte= DB_MAX_2G_MBYTE; + if (db_debug) + quiet_trace_msg("32-bit off_t reduced DB max from %s" + " to %s", + mbyte2str(old_val), + mbyte2str(db_max_mbyte)); + } +#endif + + /* use default if ./configure --with-db-memory=MB is bogus or unset */ +#if DB_MIN_MBYTE == 0 + db_min_mbyte = 64; +#else + db_min_mbyte = DB_MIN_MBYTE; + if (db_min_mbyte < DB_MIN_MIN_MBYTE) { + quiet_trace_msg("ignore bad ./configure --with-db-memory=%d", + DB_MIN_MBYTE); + 
db_min_mbyte = DB_DEF_MIN_MBYTE; + } else if (db_min_mbyte > db_max_mbyte) { + quiet_trace_msg("ignore ./configure --with-db-memory=%d" + " > DB max=%s", + mbyte2str(db_max_mbyte)); + db_min_mbyte = DB_DEF_MIN_MBYTE; + } else if (db_debug) { + quiet_trace_msg("use ./configure --with-db-memory=%d", + DB_MIN_MBYTE); + } +#endif + + db_min_byte = db_min_mbyte * (1024*1024); + db_max_byte = db_max_mbyte * (1024*1024); + +#ifdef RLIMIT_FSIZE + db_max_mbyte = (use_rlimit(RLIMIT_FSIZE, "RLIMIT_FSIZE", + db_max_byte, db_min_byte, "DB max") + / (1024*1024)); + db_max_byte = db_max_mbyte * (1024*1024); +#endif /* RLIMIT_FSIZE */ + + physmem = 0; +#ifdef HAVE_PHYSMEM_TOTAL + /* maybe someday physmem_total() will be widely available */ + physmem = physmem_total(); + if (db_debug) + quiet_trace_msg("real=%s from physmem_total()", + db_ptr2str(physmem)); +#endif +#ifdef HAVE__SC_PHYS_PAGES + if (physmem == 0) { + long pages, sizepage; + + if ((pages = sysconf(_SC_PHYS_PAGES)) == -1) { + dcc_error_msg("sysconf(_SC_PHYS_PAGES): %s", + ERROR_STR()); + } else if ((sizepage = sysconf(_SC_PAGESIZE)) == -1) { + dcc_error_msg("sysconf(_SC_PAGESIZE): %s", + ERROR_STR()); + } else { + physmem = (DB_PTR)pages * (DB_PTR)sizepage; + if (db_debug) + quiet_trace_msg("real=%s" + " from sysconf(_SC_PHYS_PAGES)" + " and sysconf(_SC_PAGESIZE)", + db_ptr2str(physmem)); + } + } +#endif +#ifdef HAVE_HW_PHYSMEM + if (physmem == 0) { + int mib[2] = {CTL_HW, HW_PHYSMEM}; + unsigned long int hw_physmem; + size_t hw_physmem_len; + + hw_physmem_len = sizeof(hw_physmem); + if (0 > sysctl(mib, 2, &hw_physmem, &hw_physmem_len, 0,0)) { + dcc_error_msg("sysctl(HW_PHYSMEM): %s", ERROR_STR()); + } else { + physmem = hw_physmem; + if (db_debug) + quiet_trace_msg("real=%s from sysctl(mib)", + db_ptr2str(physmem)); + } + } +#endif +#ifdef HAVE_PSTAT_GETSTATIC + if (physmem == 0) { + struct pst_static pss; + + if (0 > pstat_getstatic(&pss, sizeof pss, 1, 0)) { + dcc_error_msg("pstat_getstatic(): %s", ERROR_STR()); + 
} else if (pss.physical_memory <= 0 + || pss.page_size < 0) { + dcc_error_msg("pstat_getstatic() says" + " physical_memory=%d page_size=%d", + pss.physical_memory, pss.page_size); + } else { + physmem = ((DB_PTR)pss.physical_memory + * (DB_PTR)pss.page_size); + if (db_debug) + quiet_trace_msg("real=%s" + " from pstat_getstatic()", + db_ptr2str(physmem)); + } + } +#endif + + physmem_str_len = 0; + db_physmem_str[0] = '\0'; + if (physmem == 0) { + quiet_trace_msg("failed to get real memory size"); + } else { + physmem_str_len = snprintf(db_physmem_str, + sizeof(db_physmem_str), + " real=%s", + db_ptr2str(physmem)); + + /* Try to use half of physical memory + * if there is less than 2 GByte + * all except 512 MByte between 2 GByte and 4 GByte, + * and all but 1 GByte if there is more than 4 GByte */ + if (physmem/(1024*1024) < 2*1024) + new_val = physmem/2; + else if (physmem/(1024*1024) <= 4*1024) + new_val = physmem - 512*(1024*1024); + else + new_val = physmem - 1024*(1024*1024); + if (new_val < db_min_byte) { + if (db_debug) + quiet_trace_msg("real=%s would give DB max=%s" + " smaller than minimum %s", + db_ptr2str(physmem), + db_ptr2str(new_val), + mbyte2str(db_min_mbyte)); + new_val = db_min_byte; + } + if (db_max_byte > new_val) { + old_val = db_max_byte; + db_max_mbyte = new_val / (1024*1024); + db_max_byte = db_max_mbyte * (1024*1024); + if (db_debug) + quiet_trace_msg("real=%s reduced DB max" + " from %s to %s", + db_ptr2str(physmem), + db_ptr2str(old_val), + db_ptr2str(db_max_byte)); + } + } + + /* window need not be larger than the limit on the database size */ + db_max_rss = db_max_byte; + +#ifdef RLIMIT_AS + /* try not to break process virtual memory limit, + * but only if it is not ridiculously tiny */ + db_max_rss = use_rlimit(RLIMIT_AS, "RLIMIT_AS", + db_max_rss, db_min_byte, "max RSS"); +#endif /* RLIMIT_AS */ +#ifdef RLIMIT_RSS + /* try not to break process resident memory limit + * but only if it is not ridiculously tiny */ + db_max_rss = 
use_rlimit(RLIMIT_RSS, "RLIMIT_RSS", + db_max_rss, db_min_byte, "max RSS"); +#endif /* RLIMIT_RSS */ + + /* limit the database to the window size */ + if (db_max_byte > db_max_rss) { + old_val = db_max_mbyte; + db_max_mbyte = db_max_rss / (1024*1024); + db_max_byte = db_max_mbyte * (1024*1024); + if (db_debug) + quiet_trace_msg("max RSS reduced DB max from %s to %s", + mbyte2str(old_val), + mbyte2str(db_max_mbyte)); + } + +#ifndef HAVE_64BIT_PTR + /* We cannot use a window larger than 2 GBytes on most systems without + * big pointers. Among the things that break is trying to mmap() more + * than 2 GBytes. So limit the window on 32-bit systems to a little + * less than 2 GBytes and the database to not much more */ + if (db_max_rss > DB_MAX_2G_MBYTE*(1024*1024)) { + if (db_debug) + quiet_trace_msg("32-bit pointers reduced max RSS" + " from %s to %s", + db_ptr2str(db_max_rss), + mbyte2str(DB_MAX_2G_MBYTE)); + db_max_rss = DB_MAX_2G_MBYTE*(1024*1024); + new_val = db_max_rss+db_max_rss/4; + if (db_max_byte > new_val) { + old_val = db_max_mbyte; + db_max_mbyte = new_val / (1024*1024); + db_max_byte = db_max_mbyte * (1024*1024); + if (db_debug) + quiet_trace_msg("32-bit pointers reduced DB max" + " from %s to %s", + mbyte2str(old_val), + mbyte2str(db_max_mbyte)); + } + } +#endif + + snprintf(&db_physmem_str[physmem_str_len], + sizeof(db_physmem_str) - physmem_str_len, + " max RSS=%s DB max=%s", + db_ptr2str(db_max_rss), mbyte2str(db_max_mbyte)); +} + + + +/* Pick a buffer size that will hold an integral number of DB hash + * table entries and is a multiple of system's page size. + * The entire hash table should reside in memory + * if the system has enough memory. */ +u_int +db_get_pagesize(u_int old_pagesize, /* 0 or required page size */ + u_int tgt_pagesize) /* 0 or target page size */ +{ + u_int min_pagesize, max_pagesize; + + /* Ask the operating system only once so we don't get differing + * answers and so compute a varying page size. 
+ * Some systems can't keep their stories straight. */ + if (db_max_rss == 0) + get_db_max_rss(); + + /* Compute the least common multiple of the system page and + * the DB hash table entry size. + * This will give us the smallest page size that we can use. */ + system_pagesize = getpagesize(); + min_pagesize = lcm(system_pagesize, sizeof(HASH_ENTRY)); + + /* The kludge to speed conversion of database addresses to page numbers + * and offsets on 32-bit systems depends on the page size being + * a multiple of 256 */ + if ((min_pagesize % (1<<DB_PTR_SHIFT)) != 0) + dcc_logbad(EX_SOFTWARE, "page size not a multiple of 256"); + + /* The DB buffer or page size must also be a multiple of the + * the end-of-page padding used in the main database file. */ + if (sizeof(DB_RCD) % DB_RCD_HDR_LEN != 0) + dcc_logbad(EX_SOFTWARE, + "DB padding size %d" + " is not a divisor of DB entry size %d", + DB_RCD_HDR_LEN, ISZ(DB_RCD)); + if (DB_RCD_LEN_MAX % DB_RCD_HDR_LEN != 0) + dcc_logbad(EX_SOFTWARE, + "DB record not a multiple of header size"); + min_pagesize = lcm(min_pagesize, DB_RCD_HDR_LEN); + + /* Use the old buffer size if available so we are not confused + * by padding at the ends of the old pages. + * Fail if it is impossible. This should cause dbclean to + * rebuild the database. */ + if (old_pagesize != 0) { + if ((old_pagesize % min_pagesize) != 0) + return 0; + /* adjust the number of buffers to fit our window size */ + db_buf_total = db_max_rss / old_pagesize; + if (db_buf_total < (int)DB_BUF_MIN) + return 0; + if (db_buf_total > DB_BUF_MAX) + db_buf_total = DB_BUF_MAX; + return old_pagesize; + } + + db_buf_total = DB_BUF_MAX; + max_pagesize = db_max_rss / db_buf_total; + max_pagesize -= max_pagesize % min_pagesize; + + /* If we have a target page size, try to use it instead of the + * maximum page size allowed by the resident set size. + * Normal DCC databases grow large and want pages as large as possible + * but greylist databases are often small. 
+ * We also want a tiny page when first reading the parameters while + * opening. */ + if (tgt_pagesize != 0 && tgt_pagesize < max_pagesize) { + tgt_pagesize -= tgt_pagesize % min_pagesize; + if (tgt_pagesize < min_pagesize) + tgt_pagesize = min_pagesize; + return tgt_pagesize; + } else if (max_pagesize > min_pagesize) { + return max_pagesize; + } else { + return min_pagesize; + } +} + + + +/* (re)create the buffer pool + * The buffers are small blocks that point to the real mmap()'ed memory. + */ +u_char +db_buf_init(u_int old_pagesize, /* 0 or required page size */ + u_int tgt_pagesize) /* 0 or target page size */ +{ + DB_BUF *b, *bprev, *bnext; + int i; + + + db_pagesize = db_get_pagesize(old_pagesize, tgt_pagesize); + if (db_pagesize == 0) + return 0; + + /* The fragments of pages must be multiples of system pages + * so that msync() on Solaris can be given multiples of system + * pages. It's also a generally good idea. */ + db_pagesize_part = db_pagesize/DB_BUF_NUM_PARTS; + db_pagesize_part = ((db_pagesize_part + system_pagesize-1) + / system_pagesize) * system_pagesize; + + db_page_max = db_pagesize - DB_RCD_HDR_LEN; + db_hash_page_len = db_pagesize/sizeof(HASH_ENTRY); + + db_max_hash_entries = (MAX_HASH_ENTRIES + - MAX_HASH_ENTRIES % db_hash_page_len); + + memset(db_bufs, 0, sizeof(db_bufs)); + b = db_bufs; + buf_oldest = b; + bprev = 0; + for (i = db_buf_total; --i != 0; b = bnext) { + bnext = b+1; + b->older = bprev; + b->newer = bnext; + bprev = b; + } + b->older = bprev; + buf_newest = b; + + memset(db_buf_hash, 0, sizeof(db_buf_hash)); + + return 1; +} + + + +static u_char +make_new_hash(DCC_EMSG emsg, DB_HADDR new_hash_len) +{ + struct stat sb; + HASH_ENTRY *hash; + DB_HADDR next_haddr, cur_haddr, prev_haddr; + u_int pagenum; + + if (getuid() == 0) { + /* if we are running as root, + * don't change the owner of the database */ + if (0 > fstat(db_fd, &sb)) { + dcc_pemsg(EX_IOERR, emsg, "fstat(%s): %s", + db_nm, ERROR_STR()); + return 0; + } + if (0 > 
fchown(db_hash_fd, sb.st_uid, sb.st_gid)) { + dcc_pemsg(EX_IOERR, emsg, "fchown(%s,%d,%d): %s", + db_hash_nm, (int)sb.st_uid, (int)sb.st_gid, + ERROR_STR()); + return 0; + } + } + + if (new_hash_len < MIN_HASH_ENTRIES) + new_hash_len = MIN_HASH_ENTRIES; + + /* Increase the requested hash table size to a multiple of the database + * page size. The page size is chosen to be a multiple of the size of + * a single hash table entry. */ + db_hash_fsize = (((DB_HOFF)new_hash_len)*sizeof(HASH_ENTRY) + + db_pagesize-1); + db_hash_fsize -= db_hash_fsize % db_pagesize; + new_hash_len = db_hash_fsize / sizeof(HASH_ENTRY); + + if (new_hash_len > db_max_hash_entries) + new_hash_len = db_max_hash_entries; + + /* create the empty hash table file */ + rel_db_states(); + if (!db_unload(emsg, 0)) + return 0; + if (0 > ftruncate(db_hash_fd, 0)) { + dcc_pemsg(EX_IOERR, emsg, "truncate(%s,"L_HPAT"): %s", + db_hash_nm, db_csize, ERROR_STR()); + return 0; + } + + db_hash_len = new_hash_len; + db_hash_used_stored_hash = db_hash_used = DB_HADDR_BASE; + db_hash_divisor = get_db_hash_divisor(db_hash_len); + + /* Clear new hash file by linking its entries into the free list */ + /* map and clear the first page */ + if (!map_hash_ctl(emsg, 1)) + return 0; + + /* create the header */ + strcpy(db_sts.hash_ctl.d.vals->s.magic, HASH_MAGIC_STR); + db_sts.hash_ctl.d.vals->s.free_fwd = DB_HADDR_BASE; + db_sts.hash_ctl.d.vals->s.free_bak = db_hash_len-1; + db_sts.hash_ctl.d.vals->s.len = db_hash_len; + db_sts.hash_ctl.d.vals->s.divisor = db_hash_divisor; + db_sts.hash_ctl.d.vals->s.used = DB_HADDR_BASE; + db_sts.hash_ctl.d.vals->s.synced = time(0); + db_dirty = 1; +#ifdef USE_MAP_NOSYNC + db_sts.hash_ctl.d.vals->s.synced = time(0); + db_sts.hash_ctl.d.vals->s.flags |= HASH_CTL_FG_NOSYNC; + db_not_synced = 1; +#endif + + /* Link the hash table entries in the first and following pages. + * The page size is chosen to be a multiple of the size of a + * single hash table entry. 
*/ + prev_haddr = FREE_HADDR_END; + cur_haddr = DB_HADDR_BASE; + next_haddr = cur_haddr+1; + hash = &db_sts.hash_ctl.d.vals->h[DB_HADDR_BASE]; + pagenum = 0; + for (;;) { + do { + DB_HADDR_CP(hash->bak, prev_haddr); + if (next_haddr == db_hash_len) + DB_HADDR_CP(hash->fwd, FREE_HADDR_END); + else + DB_HADDR_CP(hash->fwd, next_haddr); + ++hash; + prev_haddr = cur_haddr; + cur_haddr = next_haddr++; + } while (cur_haddr % db_hash_page_len != 0); + + if (++pagenum >= db_hash_fsize/db_pagesize) + break; + + if (!map_hash(emsg, cur_haddr, &db_sts.free, 1)) + return 0; + db_sts.free.b->flush_urgent = (DB_BUF_FM)-1; + hash = db_sts.free.d.h; + } + + hash_clear_pg_num = 0; + + return 1; +} + + + +static u_char +check_old_hash(DCC_EMSG emsg) +{ + static const u_char magic[sizeof(((HASH_CTL*)0)->s.magic) + ] = HASH_MAGIC_STR; + const HASH_CTL *vals; + struct stat sb; + u_char old_db; + + /* check the size of the existing hash file */ + if (0 > fstat(db_hash_fd, &sb)) { + dcc_pemsg(EX_IOERR, emsg, "stat(%s): %s", + db_hash_nm, ERROR_STR()); + return 0; + } + db_hash_fsize = sb.st_size; + if ((db_hash_fsize % sizeof(HASH_ENTRY)) != 0) { + dcc_pemsg(EX_DATAERR, emsg, "%s has size "OFF_DPAT"," + " not a multiple of %d", + db_hash_nm, db_hash_fsize, + ISZ(HASH_ENTRY)); + return 0; + } + + db_hash_len = db_hash_fsize/sizeof(HASH_ENTRY); + if (db_hash_len < MIN_HASH_ENTRIES) { + dcc_pemsg(EX_DATAERR, emsg, + "%s has too few records, "OFF_DPAT" bytes", + db_hash_nm, db_hash_fsize); + return 0; + } + + /* check the magic number */ + if (!map_hash_ctl(emsg, 0)) + return 0; + vals = db_sts.hash_ctl.d.vals; + if (memcmp(vals->s.magic, &magic, sizeof(magic))) { + dcc_pemsg(EX_DATAERR, emsg, + "%s has the wrong magic \"%.*s\"", + db_hash_nm, ISZ(HASH_ENTRY), vals->s.magic); + return 0; + } + + if (!(vals->s.flags & HASH_CTL_FG_CLEAN)) { + dcc_pemsg(EX_DATAERR, emsg, "%s was not closed cleanly", + db_hash_nm); + return 0; + } + if (vals->s.flags & HASH_CTL_FG_NOSYNC) { +#ifdef HAVE_BOOTTIME 
+ int mib[2] = {CTL_KERN, KERN_BOOTTIME}; + size_t boottime_len; +#endif + struct timeval boottime; + + boottime.tv_sec = 0x7fffffff; +#ifdef HAVE_BOOTTIME + boottime_len = sizeof(boottime); + if (0 > sysctl(mib, 2, &boottime, &boottime_len, 0, 0)) { + dcc_error_msg("sysctl(KERN_BOOTTIME): %s", ERROR_STR()); + } +#endif + if (vals->s.synced <= boottime.tv_sec) { + dcc_pemsg(EX_DATAERR, emsg, "%s was not synchronized;" + " synced=%d boottime=%d", + db_hash_nm, + (int)vals->s.synced, (int)boottime.tv_sec); + return 0; + } + db_not_synced = 1; + } + + if (DB_HADDR_INVALID(vals->s.free_fwd) + && (vals->s.free_fwd != FREE_HADDR_END + || vals->s.free_fwd != vals->s.free_bak)) { + dcc_pemsg(EX_DATAERR, emsg, + "%s has a broken free list head of %#x", + db_hash_nm, vals->s.free_fwd); + return 0; + } + if (DB_HADDR_INVALID(vals->s.free_bak) + && (vals->s.free_bak != FREE_HADDR_END + || vals->s.free_fwd != vals->s.free_bak)) { + dcc_pemsg(EX_DATAERR, emsg, + "%s has a broken free list tail of %#x", + db_hash_nm, vals->s.free_bak); + return 0; + } + + if (db_hash_len != vals->s.len) { + dcc_pemsg(EX_DATAERR, emsg, + "%s has %d entries but claims %d", + db_hash_nm, db_hash_len, + vals->s.len); + return 0; + } + + db_hash_divisor = vals->s.divisor; + if (db_hash_divisor < MIN_HASH_DIVISOR + || db_hash_divisor >= db_hash_len) { + dcc_pemsg(EX_DATAERR, emsg, "%s has hash divisor %d", + db_hash_nm, db_hash_len); + return 0; + } + + db_hash_used_stored_hash = db_hash_used = vals->s.used; + if (db_hash_used < DB_HADDR_BASE) { + dcc_pemsg(EX_DATAERR, emsg, + "%s contains impossible %u entries", + db_hash_nm, HADDR2LEN(db_hash_used)); + return 0; + } + if (db_hash_used >= db_hash_len) { + if (db_hash_used > db_hash_len) + dcc_pemsg(EX_DATAERR, emsg, + "%s contains only %u entries but %u used", + db_hash_nm, + HADDR2LEN(db_hash_len), + HADDR2LEN(db_hash_used)); + else + dcc_pemsg(EX_DATAERR, emsg, + "%s is filled with %u entries", + db_hash_nm, + HADDR2LEN(db_hash_len)); + return 0; + 
} + + /* old databases lack the growth values */ + old_db = 0; + if (!db_rdonly + && db_parms.old_db_csize == 0 + && db_parms.db_added == 0 + && db_parms.hash_used == 0 + && db_parms.old_hash_used == 0 + && db_parms.hash_added == 0 + && db_parms.rate_secs == 0 + && db_parms.last_rate_sec == 0) { + quiet_trace_msg("repair database growth measurements"); + db_parms.old_db_csize = db_parms.db_csize; + old_db = 1; + } + + if (db_hash_used != db_parms.hash_used + && db_hash_fsize != 0) { + if (old_db) { + quiet_trace_msg("repair db_parms.old hash_used" + " and old_hash_used"); + db_parms.old_hash_used = db_hash_used; + db_parms.hash_used = db_hash_used; + } else { + dcc_pemsg(EX_DATAERR, emsg, + "%s contains %d" + " entries instead of the %d that %s claims", + db_hash_nm, db_hash_used, + db_parms.hash_used, db_nm); + return 0; + } + } + + db_csize_stored_hash = vals->s.db_csize; + if (db_csize_stored_hash != db_csize + && db_hash_fsize != 0) { + dcc_pemsg(EX_DATAERR, emsg, + "%s contains "L_DPAT + " bytes instead of the "L_DPAT" that %s claims", + db_nm, db_csize, + db_csize_stored_hash, db_hash_nm); + return 0; + } + + return 1; +} + + + +/* open the files and generally get ready to work */ +u_char /* 0=failed, 1=ok */ +db_open(DCC_EMSG emsg, + int new_db_fd, /* -1 or already open db_fd */ + const char *new_db_nm, + DB_HADDR new_hash_len, /* 0 or # of entries */ + DB_OPEN_MODES mode) /* DB_OPEN_* */ +{ + u_int cur_pagesize; + int hash_flags, db_open_flags; + struct stat db_sb; +# define OPEN_BAIL() {if (new_db_fd >= 0) db_fd = -1; \ + db_close(-1); return 0;} + + db_close(1); + db_failed_line = __LINE__; + db_failed_file = __FILE__; + db_not_synced = 0; + db_minimum_map = 0; + db_invalidate = 0; + db_dirty = 0; + db_locked.tv_sec = 0; + + db_rdonly = (mode & DB_OPEN_RDONLY) != 0; + db_use_write = (mode & DB_OPEN_MMAP_WRITE) != 0; + + memset(&db_stats, 0, sizeof(db_stats)); + + if (!new_db_nm && db_nm[0] == '\0') + new_db_nm = grey_on ? 
DB_GREY_NAME : DB_DCC_NAME; + if (new_db_nm) { + if (!fnm2rel(db_nm, new_db_nm, 0) + || !fnm2rel(db_hash_nm, db_nm, DB_HASH_SUFFIX)) { + dcc_pemsg(EX_DATAERR, emsg, + "invalid DB nm \"%s\"", new_db_nm); + return 0; + } + } + + if (new_db_fd >= 0) { + if (new_hash_len != 0) { + dcc_logbad(EX_SOFTWARE, + "extending db_open(%s) without locking", + db_nm); + return 0; + } + if (!db_rdonly) { + dcc_logbad(EX_SOFTWARE, + "db_open(%s) read/write without locking", + db_nm); + return 0; + } + db_open_flags = O_RDONLY; + hash_flags = O_RDONLY; + + db_fd = new_db_fd; + + } else { + db_open_flags = O_RDWR; + if (new_hash_len != 0) { + if (db_rdonly) { + dcc_logbad(EX_SOFTWARE, + "db_open(%s) creating read-only", + db_nm); + return 0; + } + hash_flags = O_RDWR | O_CREAT; + } else { + /* must open the file read/write to lock it */ + hash_flags = O_RDWR; + } + + db_fd = dcc_lock_open(emsg, db_nm, db_open_flags, + (mode & DB_OPEN_LOCK_NOWAIT) + ? DCC_LOCK_OPEN_NOWAIT + : 0, + DCC_LOCK_ALL_FILE, 0); + if (db_fd == -1) { + db_close(-1); + return 0; + } + } + gettimeofday(&db_time, 0); + db_locked = db_time; + if (0 > fstat(db_fd, &db_sb)) { + dcc_pemsg(EX_IOERR, emsg, "stat(%s): %s", db_nm, ERROR_STR()); + OPEN_BAIL(); + return 0; + } + db_csize = db_fsize = db_sb.st_size; + if (db_fsize < ISZ(DB_HDR)) { + dcc_pemsg(EX_IOERR, emsg, + "%s with %d bytes is too small to be a DCC database", + db_nm, (int)db_fsize); + OPEN_BAIL(); + } + + /* check the header of the database file by temporarily mapping it */ + db_buf_init(0, sizeof(DB_HDR)); + if (!map_db(emsg, 0, sizeof(DB_HDR), &db_sts.db_parms, 0)) + OPEN_BAIL(); + + db_parms_stored = *db_sts.db_parms.d.parms; + db_parms = *db_sts.db_parms.d.parms; + + if (memcmp(db_parms.version, db_version_buf, sizeof(db_version_buf))) { + dcc_pemsg(EX_DATAERR, emsg, + "%s contains the wrong magic string \"%.*s\"", + db_nm, ISZ(db_parms.version), db_parms.version); + OPEN_BAIL(); + } + if (!(db_parms.flags & DB_PARM_FG_GREY) != !grey_on) { + 
dcc_pemsg(EX_DATAERR, emsg, + "%s is%s a greylist database but must%s be", + db_nm, + (db_parms.flags & DB_PARM_FG_GREY) ? "" : " not", + grey_on ? "" : " not"); + OPEN_BAIL(); + } + + cur_pagesize = db_parms.pagesize; + + DB_SET_NOKEEP(db_parms.nokeep_cks, DCC_CK_INVALID); + DB_SET_NOKEEP(db_parms.nokeep_cks, DCC_CK_FLOD_PATH); + set_db_tholds(db_parms.nokeep_cks); + + db_ck_fuzziness = grey_on ? grey_ck_fuzziness : dcc_ck_fuzziness; + + db_csize = db_parms.db_csize; + if (db_csize < sizeof(DB_HDR)) { + dcc_pemsg(EX_DATAERR, emsg, + "%s says it contains "L_DPAT" bytes" + " or fewer than the minimum of %d", + db_nm, db_csize, DB_PTR_BASE); + /* that is a fatal error if we are not rebuilding */ + if (new_hash_len != 0) + OPEN_BAIL(); + } + if (db_csize > db_fsize) { + dcc_pemsg(EX_DATAERR, emsg, + "%s says it contains "L_DPAT" bytes" + " or more than the actual size of "OFF_DPAT, + db_nm, db_csize, db_fsize); + /* that is a fatal error if we are not rebuilding */ + if (new_hash_len != 0) + OPEN_BAIL(); + } + + /* The buffer or page size we use must be the page size used to + * write the files. 
Try to change our size to match the file */ + if (cur_pagesize != db_pagesize) { + db_invalidate = 1; + rel_db_states(); + if (!db_unload(emsg, 0)) + OPEN_BAIL(); + db_invalidate = 0; + if (!db_buf_init(cur_pagesize, 0)) { + dcc_error_msg("%s has page size %d" + " incompatible with %d in %s", + db_nm, + cur_pagesize, db_get_pagesize(0, 0), + path2fnm(db_hash_nm)); + OPEN_BAIL(); + } + } + + db_csize_stored_hash = 0; + db_hash_len = 0; + db_hash_fd = open(db_hash_nm, hash_flags, 0666); + if (db_hash_fd < 0) { + dcc_pemsg(EX_IOERR, emsg, "open(%s): %s", + db_hash_nm, ERROR_STR()); + OPEN_BAIL(); + } + if (0 > fcntl(db_hash_fd, F_SETFD, FD_CLOEXEC)) { + dcc_pemsg(EX_IOERR, emsg, "fcntl(%s, FD_CLOEXEC): %s", + db_hash_nm, ERROR_STR()); + OPEN_BAIL(); + } + + if (new_hash_len != 0) { + if (!make_new_hash(emsg, new_hash_len)) + OPEN_BAIL(); + } else { + if (!check_old_hash(emsg)) + OPEN_BAIL(); + } + + if (db_fsize % db_pagesize != 0) { + dcc_pemsg(EX_DATAERR, emsg, + "%s has size "OFF_HPAT"," + " not a multiple of its page size of %#x", + db_nm, db_fsize, db_pagesize); + OPEN_BAIL(); + } + if (db_fsize > db_csize + db_pagesize || db_csize > db_fsize) { + dcc_pemsg(EX_DATAERR, emsg, + "%s has size "OFF_HPAT" but claims "L_HPAT, + db_nm, db_fsize, db_csize); + OPEN_BAIL(); + } + +#ifndef USE_MAP_NOSYNC + /* Use `dbclean -F` on systems without mmap(NOSYNC) but with lots of + * RAM. Some Linux systems otherwise take too long to run dbclean. */ + if (mode & DB_OPEN_MMAP_WRITE_NOSYNC) { + if (db_max_rss > db_fsize + db_hash_fsize) + db_use_write = 1; + if (db_debug) + quiet_trace_msg("db_max_rss="OFF_HPAT + " db_fsize+db_hash_fsize="OFF_HPAT + " so%s use -F", + db_max_rss, db_fsize+db_hash_fsize, + db_use_write ? 
"" : " do not"); + } +#endif + + db_window_size = (DB_PTR)db_pagesize * db_buf_total; + snprintf(db_window_size_str, sizeof(db_window_size_str), + "window=%s%s", + db_ptr2str(db_window_size), db_physmem_str); + rel_db_states(); + db_failed_line = 0; + + return 1; +#undef OPEN_BAIL +} + + + +static u_char +buf_munmap(DCC_EMSG emsg, DB_BUF *b) +{ + u_char result; + + if (b->lock_cnt != 0) + dcc_logbad(EX_SOFTWARE, "unmapping locked DB buffer"); + + result = buf_flush(emsg, b, 1); + + if (db_invalidate) { + if (0 > DCC_MADV_FREE(b->buf.v)) + dcc_error_msg("madvise(FREE %s,%#x): %s", + buf2path(b), db_pagesize, ERROR_STR()); + } + + if (0 > munmap(b->buf.v, db_pagesize)) { + db_failure(__LINE__,__FILE__, EX_IOERR, emsg, + "munmap(%s,%d): %s", + buf2path(b), db_pagesize, ERROR_STR()); + result = 0; + } + b->buf.v = 0; + b->pg_num = -1; + b->buf_type = DB_BUF_TYPE_FREE; + + return result; +} + + + +static u_char +buf_mmap(DCC_EMSG emsg, DB_BUF *b, DB_PG_NUM pg_num, u_char extend) +{ + int prot, flags; + off_t offset; + int fd; + void *p; + int retry; + u_char unloaded; + + + offset = (off_t)pg_num * (off_t)db_pagesize; + fd = buf2fd(b); + + if (extend) { + offset = 0; +#if defined(MAP_ANON)|| defined(MAP_ANONYMOUS) + fd = -1; + b->flags |= DB_BUF_FG_USE_WRITE | DB_BUF_FG_EXTENSION; +#ifdef MAP_ANONYMOUS + /* Linux redefines things and requires either MAP_ANON + * or MAP_PRIVATE; */ + flags = MAP_ANONYMOUS| MAP_PRIVATE; +#else + flags = MAP_ANON | MAP_PRIVATE; +#endif /* MAP_ANONYMOUS */ +#else /* have neither MAP_ANON nor MAP_ANONYMOUS */ + b->flags |= DB_BUF_FG_USE_WRITE; + flags = MAP_PRIVATE; +#endif + } else if (db_rdonly) { + flags = MAP_SHARED; + } else if (db_use_write && !db_minimum_map) { + /* write() buffers instead of letting the Solaris virtual + * memory system do it. 
Solaris will bog the system down doing + * nothing but flushing dirty mmap() pages + * We cannot use this hack in two processes simultaneously, + * so do not use it in dccd while dbclean is running */ + b->flags |= DB_BUF_FG_USE_WRITE; + flags = MAP_PRIVATE; + } else { +#ifdef USE_MAP_NOSYNC + flags = (MAP_SHARED | MAP_NOSYNC); +#else + flags = MAP_SHARED; +#endif + } + + prot = db_rdonly ? PROT_READ : (PROT_READ | PROT_WRITE); + for (retry = 1, unloaded = 2; unloaded > 1; ++retry) { + p = mmap(0, db_pagesize, prot, flags, fd, offset); + + if (p == MAP_FAILED) { + if (errno == EACCES + || errno == EBADF + || errno == EINVAL + || errno == ENODEV + || retry > 20) { + dcc_pemsg(EX_IOERR, emsg, + "try #%d"" mmap(%s" + " %#x,%#x,%#x,%d,"OFF_HPAT"): %s", + retry, + buf2path(b), + db_pagesize, prot, flags, fd, offset, + ERROR_STR()); + return 0; + } + dcc_error_msg("try #%d mmap(%s" + " %#x,%#x,%#x,%d,"OFF_HPAT"): %s", + retry, + buf2path(b), + db_pagesize, prot, flags, fd, offset, + ERROR_STR()); +/* #define MMAP_FAIL_DEBUG 3 */ +#ifdef MMAP_FAIL_DEBUG + } else if (((uint)random() % MMAP_FAIL_DEBUG) == 0) { + /* pretend mmap() failed randomly */ + dcc_error_msg(" test fail #%d mmap(%s,%#x,"OFF_HPAT")", + retry, + buf2path(b), db_pagesize, offset); + if (0 > munmap(p, db_pagesize)) + dcc_error_msg( "test munmap(): %s", + ERROR_STR()); +#endif + } else { + /* It worked. + * Say so if it was not the first attempt. */ + if (retry != 1) + dcc_error_msg("try #%d" + " mmap(%s,%#x,"OFF_HPAT") ok", + retry, + buf2path(b), db_pagesize, offset); + break; + } + + /* mmap() fails occassionally on some systems, + * so try to release something and try again */ + unloaded = db_unload(0, 1); + } + + + b->buf.v = p; + b->flush = 0; + b->flush_urgent = 0; + + if (extend) + return 1; + + /* madvise() on some systems including FreeBSD uses a lot of CPU cycles, + * so it should not be done unless it is likely to do significant good. 
+ * Get all of our buffers if there is plenty of memory + * and we are not trying to stay out of the way of dbclean. */ + if (!db_minimum_map && db_fsize <= db_max_rss) { + /* The flat file would fit. If the hash table would also + * fit, tell the kernel to be aggressive */ + if (db_fsize + db_hash_fsize <= db_max_rss + && 0 > DCC_MADV_WILLNEED(p)) + dcc_error_msg("madvise(WILLNEED %s,%#x): %s", + buf2path(b), db_pagesize, ERROR_STR()); + } else { + if (0 > DCC_MADV_RANDOM(p)) + dcc_error_msg("madvise(RANDOM %s,%#x): %s", + buf2path(b), db_pagesize, ERROR_STR()); + } + + return 1; +} + + + +/* get a free buffer for a chunk of either the hash table or database files */ +static DB_BUF * +get_free_buf(DCC_EMSG emsg, DB_BUF **bh) +{ + DB_BUF *b; + + /* Look for an unlocked buffer. + * We know there is one because we have more buffers than + * can be locked simultaneously. */ + b = buf_oldest; + for (;;) { + if (!b) + dcc_logbad(EX_SOFTWARE, "broken DB buffer MRU chain"); + if (!b->lock_cnt) + break; + b = b->newer; + } + + /* Found an unlocked buffer. + * Unlink it from its hash chain. 
*/ + if (b->fwd) + b->fwd->bak = b->bak; + if (b->bak) + b->bak->fwd = b->fwd; + else if (b->hash) + *b->hash = b->fwd; + if (b->buf_type != DB_BUF_TYPE_FREE) { + if (!buf_munmap(emsg, b)) + return 0; + } + + b->flags = 0; + + /* put it on the new hash chain */ + b->bak = 0; + b->hash = bh; + b->fwd = *bh; + *bh = b; + if (b->fwd) + b->fwd->bak = b; + + return b; +} + + + +static DB_BUF * +find_buf(DCC_EMSG emsg, DB_BUF_TYPE buf_type, DB_PG_NUM pg_num) +{ + DB_BUF *b, **bh; + + bh = DB_BUF_HASH(pg_num, buf_type); + b = *bh; + for (;;) { + if (!b) { + /* we ran off the end of the buffer hash chain, + * so get a free buffer */ + b = get_free_buf(emsg, bh); + if (!b) + return 0; + b->buf_type = buf_type; + b->pg_num = pg_num; + break; + } + if (b->buf_type == buf_type + && b->pg_num == pg_num) + break; /* found the buffer we need */ + + b = b->fwd; + } + + /* make the buffer newest */ + if (buf_newest != b) { + /* unlink it */ + b->newer->older = b->older; + if (b->older) + b->older->newer = b->newer; + else + buf_oldest = b->newer; + /* insert it at the head of the MRU list */ + b->newer = 0; + b->older = buf_newest; + buf_newest->newer = b; + buf_newest = b; + } + + return b; +} + + + +static DB_BUF * +find_st_buf(DCC_EMSG emsg, DB_BUF_TYPE buf_type, DB_STATE *st, + DB_PG_NUM pg_num, u_char extend) +{ + DB_BUF *b; + + /* release previous buffer unless it is the right one */ + b = st->b; + if (b) { + if (b->pg_num == pg_num + && b->buf_type == buf_type) + return b; /* already have the target buffer */ + + st->b = 0; + st->d.v = 0; + if (--b->lock_cnt < 0) + dcc_logbad(EX_SOFTWARE, "bad database buffer lock"); + } + + /* look for the buffer */ + b = find_buf(emsg, buf_type, pg_num); + if (!b) + return 0; + + ++b->lock_cnt; + if (b->buf.v) { + if (extend && !(b->flags & DB_BUF_FG_USE_WRITE)) + dcc_logbad(EX_SOFTWARE, "extending ordinary buffer"); + + } else { + /* map it if it was not already known */ + if (!buf_mmap(emsg, b, pg_num, extend)) { + b->buf_type = 
DB_BUF_TYPE_FREE; + b->pg_num = -1; + if (--b->lock_cnt != 0) + dcc_logbad(EX_SOFTWARE, + "stolen database buffer lock %d", + b->lock_cnt); + return 0; + } + if (buf_type == DB_BUF_TYPE_DB) + ++db_stats.db_mmaps; + else if (buf_type == DB_BUF_TYPE_HASH) + ++db_stats.hash_mmaps; + } + + st->b = b; + st->d.v = 0; + return b; +} + + + +static u_char +map_hash_ctl(DCC_EMSG emsg, u_char new) +{ + DB_BUF *b; + + b = find_st_buf(emsg, DB_BUF_TYPE_HASH, &db_sts.hash_ctl, 0, new); + if (!b) + return 0; + db_sts.hash_ctl.s.haddr = 0; + db_sts.hash_ctl.d.v = b->buf.v; + return 1; +} + + + +/* mmap() a hash table entry */ +static u_char +map_hash(DCC_EMSG emsg, + DB_HADDR haddr, /* this entry */ + DB_STATE *st, /* point this to the entry */ + u_char new) +{ + DB_PG_NUM pg_num; + DB_PG_OFF pg_off; + DB_BUF *b; + + if (haddr >= db_hash_len || haddr < DB_HADDR_BASE) { + dcc_pemsg(EX_DATAERR, emsg, "invalid hash address %#x", + haddr); + return 0; + } + + pg_num = haddr / db_hash_page_len; + pg_off = haddr % db_hash_page_len; + + b = find_st_buf(emsg, DB_BUF_TYPE_HASH, st, pg_num, new); + if (!b) + return 0; + st->s.haddr = haddr; + st->d.h = &b->buf.h[pg_off]; + return 1; +} + + + +/* unlink a hash table entry from the free list + * uses db_sts.tmp */ +static u_char +unlink_free_hash(DCC_EMSG emsg, + DB_STATE *hash_st) /* remove this from the free list */ +{ + DB_HADDR fwd, bak; + + if (!db_make_dirty(emsg)) + return 0; + + fwd = DB_HADDR_EX(hash_st->d.h->fwd); + bak = DB_HADDR_EX(hash_st->d.h->bak); + if (!HE_IS_FREE(hash_st->d.h) + || (DB_HADDR_INVALID(fwd) && fwd != FREE_HADDR_END) + || (DB_HADDR_INVALID(bak) && bak != FREE_HADDR_END) + || DB_HPTR_EX(hash_st->d.h->rcd) != DB_PTR_NULL) { + dcc_pemsg(EX_DATAERR, emsg, + "bad hash free list entry at %#x", hash_st->s.haddr); + return 0; + } + + if (fwd != FREE_HADDR_END) { + if (!map_hash(emsg, fwd, &db_sts.tmp, 0)) + return 0; + if (DB_HADDR_EX(db_sts.tmp.d.h->bak) != hash_st->s.haddr) { + dcc_pemsg(EX_DATAERR, emsg, "free %#x 
--> bad-free %#x", + hash_st->s.haddr, fwd); + return 0; + } + DB_HADDR_CP(db_sts.tmp.d.h->bak, bak); + SET_FLUSH_HE(&db_sts.tmp); + } else { + if (!map_hash_ctl(emsg, 0)) + return 0; + if (db_sts.hash_ctl.d.vals->s.free_bak != hash_st->s.haddr) { + dcc_pemsg(EX_DATAERR, emsg, "free %#x --> bad-free %#x", + hash_st->s.haddr, fwd); + return 0; + } + db_sts.hash_ctl.d.vals->s.free_bak = bak; + SET_FLUSH_HCTL(0); + } + + if (bak != FREE_HADDR_END) { + if (!map_hash(emsg, bak, &db_sts.tmp, 0)) + return 0; + if (DB_HADDR_EX(db_sts.tmp.d.h->fwd) != hash_st->s.haddr) { + dcc_pemsg(EX_DATAERR, emsg, "bad free %#x <-- free %#x", + bak, hash_st->s.haddr); + return 0; + } + DB_HADDR_CP(db_sts.tmp.d.h->fwd, fwd); + SET_FLUSH_HE(&db_sts.tmp); + } else { + if (!map_hash_ctl(emsg, 0)) + return 0; + if (db_sts.hash_ctl.d.vals->s.free_fwd != hash_st->s.haddr) { + dcc_pemsg(EX_DATAERR, emsg, "free %#x --> bad-free %#x", + hash_st->s.haddr, bak); + return 0; + } + db_sts.hash_ctl.d.vals->s.free_fwd = fwd; + SET_FLUSH_HCTL(0); + } + + memset(hash_st->d.h, 0, sizeof(HASH_ENTRY)); + SET_FLUSH_HE(hash_st); + + ++db_hash_used; + return 1; +} + + + +/* get a free hash table entry and leave db_sts.free pointing to it */ +static u_char /* 0=failed, 1=got it */ +get_free_hash(DCC_EMSG emsg, + DB_HADDR result) /* try near here */ +{ + DB_HADDR pg_start, pg_lim, bak; + int i; + + if (db_hash_len <= db_hash_used) { + dcc_pemsg(EX_OSFILE, emsg, "no free hash table entry;" + " %d of %d used", db_hash_used, db_hash_len); + return 0; + } + + /* Look first near the target */ + if (result < DB_HADDR_BASE) + result = DB_HADDR_BASE; + pg_start = result - (result % db_hash_page_len); + pg_lim = pg_start + db_hash_page_len-1; + if (pg_lim >= db_hash_len) + pg_lim = db_hash_len-1; + for (i = 0; i < 3 && ++result < pg_lim; ++i) { + if (!map_hash(emsg, result, &db_sts.free, 0)) + return 0; + if (HE_IS_FREE(db_sts.free.d.h)) + return unlink_free_hash(emsg, &db_sts.free); + } + + /* check the local ad hoc free 
list at the end of the page */ + if (!map_hash(emsg, pg_lim, &db_sts.free, 0)) + return 0; + if (HE_IS_FREE(db_sts.free.d.h)) { + /* the ad hoc free list is not empty, + * so try to use the previous entry */ + bak = DB_HADDR_EX(db_sts.free.d.h->bak); + if (bak != FREE_HADDR_END) { + if (!map_hash(emsg, bak, &db_sts.free, 0)) + return 0; + } + return unlink_free_hash(emsg, &db_sts.free); + } + + + /* Give up and search from the start of the free list. This happens + * only when the current and all preceding pages are full. */ + if (!map_hash_ctl(emsg, 0)) + return 0; + result = db_sts.hash_ctl.d.vals->s.free_fwd; + if (DB_HADDR_INVALID(result)) { + dcc_pemsg(EX_DATAERR, emsg, + "broken hash free list head of %#x", result); + return 0; + } + if (!map_hash(emsg, result, &db_sts.free, 0)) + return 0; + return unlink_free_hash(emsg, &db_sts.free); +} + + + +/* mmap() a database entry + * We assume that no database entry spans buffers, + * and that there are enough buffers to accomodate all possible + * concurrent requests. 
*/ +static u_char +map_db(DCC_EMSG emsg, + DB_PTR rptr, /* address of the record */ + u_int tgt_len, /* its length */ + DB_STATE *st, /* point this to the record */ + u_char extend) +{ + DB_PG_NUM pg_num; + DB_PG_OFF pg_off; + DB_BUF *b; + + if (rptr+tgt_len > db_fsize) { + db_failure(__LINE__,__FILE__, EX_DATAERR, emsg, + "invalid database address "L_HPAT" or length %d" + " past db_fsize "OFF_HPAT" in %s", + rptr, tgt_len, db_fsize, db_nm); + return 0; + } + + /* Try to optimize this to avoid udivdi3() and umoddi3(), + * because they are a major time sink here on 32-bit systems */ + pg_num = DB_PTR2PG_NUM(rptr, db_pagesize); +#ifdef HAVE_64BIT_LONG + pg_off = rptr % db_pagesize; +#else + pg_off = rptr - pg_num*(DB_PTR)db_pagesize; +#endif + + /* do not go past the end of a buffer */ + if (tgt_len+pg_off > db_pagesize) { + db_failure(__LINE__,__FILE__, EX_DATAERR, emsg, + "invalid database address "L_HPAT + " or length %#x in %s", + rptr, tgt_len, db_nm); + return 0; + } + + b = find_st_buf(emsg, DB_BUF_TYPE_DB, st, pg_num, extend); + if (!b) + return 0; + st->s.rptr = rptr; + st->d.r = (DB_RCD *)&b->buf.c[pg_off]; + return 1; +} + + + +u_char /* 0=failed, 1=got it */ +db_map_rcd(DCC_EMSG emsg, + DB_STATE *rcd_st, /* point this to the record */ + DB_PTR rptr, /* that is here */ + int *rcd_lenp) /* put its length here */ +{ + u_int rcd_len; + + if (DB_PTR_IS_BAD(rptr)) { + dcc_pemsg(EX_DATAERR, emsg, + "getting bogus record at "L_HPAT", in %s", + rptr, db_nm); + return 0; + } + + if (!map_db(emsg, rptr, DB_RCD_HDR_LEN, rcd_st, 0)) + return 0; + rcd_len = DB_RCD_LEN(rcd_st->d.r); + + if (&rcd_st->d.c[rcd_len] > &rcd_st->b->buf.c[db_pagesize]) { + dcc_pemsg(EX_DATAERR, emsg, + "invalid checksum count %d at "L_HPAT" in %s", + DB_NUM_CKS(rcd_st->d.r), rptr, db_nm); + return 0; + } + + if (rcd_lenp) + *rcd_lenp = rcd_len; + return 1; +} + + + +/* write the new sizes of the files into the files */ +static u_char +db_set_sizes(DCC_EMSG emsg) +{ + u_char result = 1; + + if 
(db_hash_fd != -1 + && (db_csize_stored_hash != db_csize + || db_hash_used_stored_hash != db_hash_used)) { + if (!map_hash_ctl(emsg, 0)) { + result = 0; + } else { + db_sts.hash_ctl.d.vals->s.db_csize = db_csize; + db_csize_stored_hash = db_csize; + + db_sts.hash_ctl.d.vals->s.used = db_hash_used; + db_hash_used_stored_hash = db_hash_used; + + SET_FLUSH_HCTL(0); + } + } + + if (db_fd != -1 + && (db_parms_stored.db_csize != db_csize + || db_parms_stored.hash_used != db_hash_used)) { + if (!map_db(emsg, 0, sizeof(DB_HDR), &db_sts.db_parms, 0)) { + result = 0; + } else { + db_sts.db_parms.d.parms->db_csize = db_csize; + db_parms_stored.db_csize = db_csize; + db_parms.db_csize = db_csize; + + db_sts.db_parms.d.parms->hash_used = db_hash_used; + db_parms_stored.hash_used = db_hash_used; + db_parms.hash_used = db_hash_used; + + db_sts.db_parms.d.parms->last_rate_sec = db_time.tv_sec; + db_parms_stored.last_rate_sec = db_time.tv_sec; + db_parms.last_rate_sec = db_time.tv_sec; + + db_set_flush(&db_sts.db_parms, 1, sizeof(DB_PARMS)); + } + } + + return result; +} + + + +/* write the database parameters into the magic number headers of the files */ +u_char +db_flush_parms(DCC_EMSG emsg) +{ + if (!db_set_sizes(emsg)) + return 0; + + if (db_fd == -1) + return 1; + + if (memcmp(&db_parms, &db_parms_stored, sizeof(db_parms))) { + if (!map_db(emsg, 0, sizeof(DB_HDR), &db_sts.db_parms, 0)) + return 0; + + db_parms.pagesize = db_pagesize; + + *db_sts.db_parms.d.parms = db_parms; + db_parms_stored = db_parms; + + db_set_flush(&db_sts.db_parms, 1, sizeof(DB_PARMS)); + } + + return 1; +} + + + +/* find a checksum in an already mapped record */ +DB_RCD_CK * /* 0=not found, 1=broken database */ +db_find_ck(DCC_EMSG emsg, + DB_RCD *rcd, + DB_PTR rptr, + DCC_CK_TYPES type) /* find this type of checksum */ +{ + DB_RCD_CK *rcd_ck; + int i; + + rcd_ck = rcd->cks; + i = DB_NUM_CKS(rcd); + if (i >= DCC_NUM_CKS) { + dcc_pemsg(EX_DATAERR, emsg, + "impossible %d checksums in "L_HPAT" in %s", + i, 
rptr, db_nm); + return (DB_RCD_CK *)1; + } + + for (; i != 0; --i, ++rcd_ck) { + if (DB_CK_TYPE(rcd_ck) == type) + return rcd_ck; + } + + return 0; +} + + + +/* find a checksum type known to be in a record */ +DB_RCD_CK * /* 0=it's not there */ +db_map_rcd_ck(DCC_EMSG emsg, + DB_STATE *rcd_st, /* point this to the record */ + DB_PTR rptr, /* that is here */ + DCC_CK_TYPES type) /* find this type of checksum */ +{ + DB_RCD_CK *rcd_ck; + + if (!db_map_rcd(emsg, rcd_st, rptr, 0)) + return 0; + + rcd_ck = db_find_ck(emsg, rcd_st->d.r, rptr, type); + if (rcd_ck == (DB_RCD_CK *)1) + return 0; + if (rcd_ck == 0) { + dcc_pemsg(EX_DATAERR, emsg, + "missing \"%s\" checksum in "L_HPAT" in %s", + DB_TYPE2STR(type), rptr, db_nm); + return 0; + } + return rcd_ck; +} + + + +static inline u_char /* 1=has a small prime factor */ +modulus_has_divisor(DB_HADDR len) +{ + static int primes[] = { + 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, + 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, + 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, + 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, + 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, + 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, + 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499}; + int *p; + + for (p = &primes[0]; p <= LAST(primes); ++p) { + if ((len % *p) == 0) + return 1; + } + return 0; +} + + + +/* Get a modulus for the hash function that is tolerably likely to be + * relatively prime to most inputs. The worst that happens when the modulus + * is composite is that large multiples of its factors will suffer more + * collisions. 
*/ +DB_HADDR +get_db_hash_divisor(DB_HADDR len) +{ + DB_HADDR divisor; + + divisor = len - DB_HADDR_BASE; + if (!(divisor & 1)) + --divisor; + while (divisor >= MIN_HASH_ENTRIES) { + if (modulus_has_divisor(divisor)) + divisor -= 2; + else + break; + } + return divisor; +} + + + +DB_HADDR +db_hash(DCC_CK_TYPES type, const DCC_SUM sum) +{ + u_int64_t accum, wrap; + const u_int32_t *wp; + union { + DCC_SUM sum; + u_int32_t words[4]; + } buf; + int align; + DB_HADDR haddr; + +#ifdef HAVE_64BIT_PTR + align = (u_int64_t)sum & 3; +#else + align = (u_int)sum & 3; +#endif + if (align == 0) { + /* We almost always take this branch because database + * records contain 12+N*24 bytes. That also implies that + * we should not hope for better than 4 byte alignment. */ + wp = (u_int32_t *)sum; + } else { + memcpy(buf.sum, sum, sizeof(buf.sum)); + wp = buf.words; + } + + /* MD5 checksums are uniformly distributed, and so DCC_SUMs are + * directly useful for hashing except when they are server-IDs */ + accum = *wp++; + accum += *wp++; + wrap = accum >>32; + accum <<= 32; + accum += wrap + type; + accum += *wp++; + accum += *wp; + + haddr = accum % db_hash_divisor; + haddr += DB_HADDR_BASE; + + /* do not hash into the last slot of a page, because it is used to + * find local free slots */ + if (haddr % db_hash_page_len == db_hash_page_len-1) { + ++haddr; + if (haddr >= db_hash_len) + haddr = DB_HADDR_BASE; + } + return haddr; +} + + + +/* look for a checksum in the hash table + * return with an excuse, the home slot, or the last entry on + * the collision chain */ +DB_FOUND +db_lookup(DCC_EMSG emsg, DCC_CK_TYPES type, const DCC_SUM sum, + DB_HADDR lo, /* postpone if out of this window */ + DB_HADDR hi, + DB_STATE *hash_st, /* hash block for record or related */ + DB_STATE *rcd_st, /* put the record or garbage here */ + DB_RCD_CK **prcd_ck) /* point to cksum if found */ +{ + DB_HADDR haddr, haddr_fwd, haddr_bak; + DB_PTR db_ptr; + DB_RCD_CK *found_ck; + DB_HADDR failsafe; + + haddr = 
db_hash(type, sum); + if (haddr < lo || haddr > hi) { + if (lo == 0 && hi == MAX_HASH_ENTRIES) { + dcc_pemsg(EX_DATAERR, emsg, + "out of range hash address"); + return DB_FOUND_SYSERR; + } + return DB_FOUND_LATER; + } + + if (prcd_ck) + *prcd_ck = 0; + + if (!map_hash(emsg, haddr, hash_st, 0)) + return DB_FOUND_SYSERR; + + if (HE_IS_FREE(hash_st->d.h)) + return DB_FOUND_EMPTY; + + if (!DB_HADDR_C_NULL(hash_st->d.h->bak)) + return DB_FOUND_INTRUDER; + + /* We know that the current hash table entry is in its home slot. + * It might be for the key or checksum we are looking for + * or it might be for some other checksum with the same hash value. */ + for (failsafe = 0; failsafe <= db_hash_len; ++failsafe) { + if (HE_CMP(hash_st->d.h, type, sum)) { + /* This hash table entry could be for our target + * checksum. Read the corresponding record so we + * decide whether we have a hash collision or we + * have found a record containing our target checksum. + * + * find right type of checksum in the record */ + db_ptr = DB_HPTR_EX(hash_st->d.h->rcd); + found_ck = db_map_rcd_ck(emsg, rcd_st, db_ptr, type); + if (!found_ck) + return DB_FOUND_SYSERR; + if (!memcmp(sum, found_ck->sum, + sizeof(DCC_SUM))) { + if (prcd_ck) + *prcd_ck = found_ck; + return DB_FOUND_IT; + } + } + + /* This DB record was a hash collision, or for a checksum + * other than our target. 
+ * Fail if this is the end of the hash chain */ + haddr_fwd = DB_HADDR_EX(hash_st->d.h->fwd); + if (haddr_fwd == DB_HADDR_NULL) + return DB_FOUND_CHAIN; + + if (DB_HADDR_INVALID(haddr_fwd)) { + dcc_pemsg(EX_DATAERR, emsg, + "broken hash chain fwd-link" + " #%d %#x at %#x in %s", + failsafe, haddr_fwd, haddr, db_hash_nm); + return DB_FOUND_SYSERR; + } + + if (!map_hash(emsg, haddr_fwd, hash_st, 0)) + return DB_FOUND_SYSERR; + + haddr_bak = DB_HADDR_EX(hash_st->d.h->bak); + if (haddr_bak != haddr) { + dcc_pemsg(EX_DATAERR, emsg, + "broken hash chain links #%d," + " %#x-->%#x but %#x<--%#x in %s", + failsafe, + haddr, haddr_fwd, + haddr_bak, haddr_fwd, + db_hash_nm); + return DB_FOUND_SYSERR; + } + haddr = haddr_fwd; + } + dcc_pemsg(EX_DATAERR, emsg, "infinite hash chain at %#x in %s", + haddr, db_hash_nm); + return DB_FOUND_SYSERR; +} + + + +/* combine checksums */ +DCC_TGTS +db_sum_ck(DCC_TGTS prev, /* previous sum */ + DCC_TGTS rcd_tgts, /* from the record */ + DCC_CK_TYPES type UATTRIB) +{ + DCC_TGTS res; + + /* This arithmetic must be commutative (after handling deleted + * values), because inter-server flooding causes records to appear in + * the database out of temporal order. + * + * DCC_TGTS_TOO_MANY can be thought of as a count of plus infinity. + * DCC_TGTS_OK is like minus infinity. + * DCC_TGTS_OK2 like half of minus infinity + * DCC_TGTS_TOO_MANY (plus infinity) added to DCC_TGTS_OK (minus + * infinity) or DCC_TGTS_OK2 yields DCC_TGTS_OK or DCC_TGTS_OK2. + * + * Reputations never reach infinity. + * + * Claims of not-spam from all clients are discarded as they arrive + * and before here. 
They can only come from the local white list + */ +#define SUM_OK_DEL(p,r) { \ + if (rcd_tgts == DCC_TGTS_OK || prev == DCC_TGTS_OK) \ + return DCC_TGTS_OK; \ + if (rcd_tgts == DCC_TGTS_OK2 || prev == DCC_TGTS_OK2) \ + return DCC_TGTS_OK2; \ + if (rcd_tgts == DCC_TGTS_DEL) \ + return prev; \ + } + + res = prev+rcd_tgts; + if (res <= DCC_TGTS_TOO_MANY) + return res; + + SUM_OK_DEL(prev, rcd_tgts); + return DCC_TGTS_TOO_MANY; +#undef SUM_OK_DEL +} + + + +/* delete all reports that contain the given checksum */ +static u_char /* 1=done, 0=broken database */ +del_ck(DCC_EMSG emsg, + DCC_TGTS *res, /* residual targets after deletion */ + const DB_RCD *new, /* delete reports older than this one */ + DCC_CK_TYPES type, /* delete this type of checksum */ + DB_RCD_CK *prev_ck, /* starting with this one */ + DB_STATE *prev_st) /* use this scratch state block */ +{ + DB_PTR prev; + + *res = 0; + for (;;) { + /* delete reports that are older than the delete request */ + if (dcc_ts_newer_ts(&new->ts, &prev_st->d.r->ts) + && DB_RCD_ID(prev_st->d.r) != DCC_ID_WHITE) { + DB_TGTS_RCD_SET(prev_st->d.r, 0); + DB_TGTS_CK_SET(prev_ck, 0); + SET_FLUSH_RCD(prev_st, 1); + + } else { + /* sum reports that are not deleted */ + *res = db_sum_ck(*res, DB_TGTS_RCD(prev_st->d.r), type); + } + + prev = DB_PTR_EX(prev_ck->prev); + if (prev == DB_PTR_NULL) + return 1; + prev_ck = db_map_rcd_ck(emsg, prev_st, prev, type); + if (!prev_ck) + return 0; + } +} + + + +/* see if the new and preceding records are from the same era */ +static inline u_char /* 1=different eras */ +ck_old_spam(const DB_RCD *new, const DCC_TS* prev, DCC_CK_TYPES type) +{ + struct timeval tv; + time_t secs; + DCC_TS past; + + secs = db_parms.ex_secs[type].spam; + if (secs > DCC_OLD_SPAM_SECS) + secs = DCC_OLD_SPAM_SECS; + dcc_ts2timeval(&tv, &new->ts); + dcc_timeval2ts(&past, &tv, -secs); + + return dcc_ts_older_ts(prev, &past); +} + + + +/* Mark reports made obsolete by a spam report + * A new report of spam makes 
sufficiently old reports obsolete. + * + * Sufficiently recent non-obsolete reports make a new report obsolete, + * or at least not worth spending bandwidth to flood. + * "Sufficiently recent" should be defined so that this server and + * its downstream flooding peers always have reports of the checksums + * in the report. So we want to keep (not make obsolete) at least one + * report per expiration duration. We cannot know the expiration durations + * of our peers, but we known DB_EXPIRE_SPAMSECS_DEF_MIN which influences + * DCC_OLD_SPAM_SECS. + * + * However, if another checksum in the new report was kept, then + * prefer marking old checksums obsolete. + * + * db_sts.rcd points to the new record + * db_sts.rcd2 points the the previous record and is changed + */ +static u_char /* 1=done, 0=broken database */ +ck_obs_spam(DCC_EMSG emsg, + const DB_RCD *new, + DCC_TGTS new_tgts, + DB_RCD_CK *new_ck, + DCC_CK_TYPES type, /* check this type of checksum */ + DB_RCD_CK *prev_ck, /* starting with this one */ + DCC_TGTS prev_ck_tgts, + u_char *keeping_new) /* 1=already keeping the new record */ +{ + int limit; + DB_PTR prev; + + limit = 100; + for (;;) { + /* preceding white listed entries make new entries obsolete */ + if (DB_RCD_ID(db_sts.rcd2.d.r) == DCC_ID_WHITE) { + new_ck->type_fgs |= DB_CK_FG_OBS; + SET_FLUSH_RCD(&db_sts.rcd, 1); + return 1; + } + + if (DB_CK_OBS(prev_ck) + || DB_TGTS_RCD(db_sts.rcd2.d.r) == 0) { + /* notice duplicates and + * don't look forever for recent non-obsolete report */ + if (!memcmp(&new->ts, &db_sts.rcd2.d.r->ts, + sizeof(new->ts)) + || --limit == 0) { + *keeping_new = 1; + return 1; + } + + } else if (prev_ck_tgts != DCC_TGTS_TOO_MANY) { + /* Mark this predecessor obsolete because it + * was before the checksum became spam. 
*/ + prev_ck->type_fgs |= DB_CK_FG_OBS; + SET_FLUSH_RCD(&db_sts.rcd2, 0); + + /* continue backwards to mark more non-spam + * predecessors obsolete */ + + } else if (!*keeping_new + && ck_old_spam(new, &db_sts.rcd2.d.r->ts, type)) { + /* We do not yet have a reason to keep the new report + * and this predecessor is at or after a spam report. + * We need the new report because it and the + * predecessor are from different eras. + * If the new report is not of spam, it will be + * compressed with a preceding spam report. */ + *keeping_new = 1; + /* The predecessor is not needed if the new record + * is for spam */ + if (new_tgts == DCC_TGTS_TOO_MANY) { + prev_ck->type_fgs |= DB_CK_FG_OBS; + SET_FLUSH_RCD(&db_sts.rcd2, 0); + } + /* We're finished, because all older preceding reports + * were marked obsolete when this older predecessor + * was linked. */ + return 1; + + } else { + /* this predecessor is about as recent as the new + * record, so the new record is unneeded noise that + * would bloat other servers' databases. 
*/ + new_ck->type_fgs |= DB_CK_FG_OBS; + return 1; + } + + prev = DB_PTR_EX(prev_ck->prev); + if (prev == DB_PTR_NULL) { + /* the new record is a new report of spam */ + *keeping_new = 1; + return 1; + } + + prev_ck = db_map_rcd_ck(emsg, &db_sts.rcd2, prev, type); + if (!prev_ck) + return 0; + prev_ck_tgts = DB_TGTS_CK(prev_ck); + } +} + + + +/* mark extra server-ID declarations obsolete + * + * db_sts.rcd points to the new record + * db_sts.rcd2 points the the previous record and is changed */ + static u_char /* 1=done, 0=broken database */ +srvr_id_ck(DCC_EMSG emsg, + const DB_RCD *new, + DB_RCD_CK *new_ck, + DB_RCD_CK *prev_ck) /* starting with this one */ +{ + DB_PTR prev; + DCC_SRVR_ID new_id, prev_id; + struct timeval tv; + DCC_TS week_ts; + + dcc_ts2timeval(&tv, &new->ts); + tv.tv_usec = 0; + tv.tv_sec -= tv.tv_sec % (7*24*60*60); + dcc_timeval2ts(&week_ts, &tv, 0); + + new_id = DB_RCD_ID(new); + for (;;) { + /* mark duplicate older declarations and deletions obsolete */ + prev_id = DB_RCD_ID(db_sts.rcd2.d.r); + if (!DCC_ID_SRVR_TYPE(prev_id) + || DB_TGTS_RCD(db_sts.rcd2.d.r) == 0) { + if (dcc_ts_newer_ts(&db_sts.rcd2.d.r->ts, &new->ts)) { + new_ck->type_fgs |= DB_CK_FG_OBS; + SET_FLUSH_RCD(&db_sts.rcd, 1); + } else { + prev_ck->type_fgs |= DB_CK_FG_OBS; + SET_FLUSH_RCD(&db_sts.rcd2, 1); + } + return 1; + } + + /* Keep many identical type declarations as a kludge to ensure + * that rewound flooding sends type declarations early. + * Keep only one delcaration per week. */ + if (DCC_ID_SRVR_TYPE(new_id)) { + /* Zap the new declaration and stop if the + * new declaration is older than the predecessor. 
*/ + if (dcc_ts_newer_ts(&db_sts.rcd2.d.r->ts, &new->ts)) { + new_ck->type_fgs |= DB_CK_FG_OBS; + SET_FLUSH_RCD(&db_sts.rcd, 1); + return 1; + } + + /* Stop when we find a duplicate type declaration + * of a different week */ + if (prev_id == new_id + && dcc_ts_older_ts(&db_sts.rcd2.d.r->ts, + &week_ts)) { + return 1; + } + + /* continue zapping preceding declarations */ + prev_ck->type_fgs |= DB_CK_FG_OBS; + SET_FLUSH_RCD(&db_sts.rcd2, 1); + } + + prev = DB_PTR_EX(prev_ck->prev); + if (prev == DB_PTR_NULL) + return 1; + + prev_ck = db_map_rcd_ck(emsg, &db_sts.rcd2, + prev, DCC_CK_SRVR_ID); + if (!prev_ck) + return 0; + } +} + + + +/* Install pointers in the hash table for a record and fix the accumulated + * counts in the record pointed to by db_sts.rcd + * Use db_sts.rcd, db_sts.hash, db_sts.rcd2, db_sts.free, db_sts.tmp + * The caller must deal with db_make_dirty() */ +u_char /* 0=failed, 1=done */ +db_link_rcd(DCC_EMSG emsg, DB_HADDR lo, DB_HADDR hi) +{ + DCC_TGTS res; + DB_RCD *rcd; + DB_RCD_CK *prev_ck; + DB_RCD_CK *rcd_ck; + DCC_CK_TYPES rcd_type; + DCC_TGTS rcd_tgts, prev_ck_tgts; + int ck_num; + DB_HADDR haddr; + u_char keeping_new; + + keeping_new = 0; + rcd = db_sts.rcd.d.r; + rcd_tgts = DB_TGTS_RCD_RAW(rcd); + rcd_ck = rcd->cks; + ck_num = DB_NUM_CKS(rcd); + if (ck_num > DIM(rcd->cks)) { + dcc_pemsg(EX_OSFILE, emsg, + "bogus checksum count %#x at "L_HPAT" in %s", + rcd->fgs_num_cks, db_sts.rcd.s.rptr, db_nm); + return 0; + } + for (; ck_num > 0; --ck_num, ++rcd_ck) { + rcd_type = DB_CK_TYPE(rcd_ck); + if (!DCC_CK_OK_DB(grey_on, rcd_type)) { + dcc_pemsg(EX_OSFILE, emsg, + "invalid checksum type %s at "L_HPAT" in %s", + DB_TYPE2STR(rcd_type), + db_sts.rcd.s.rptr, db_nm); + return 0; + } + + rcd_ck->prev = DB_PTR_CP(DB_PTR_NULL); + + /* Do not link paths or whitelist file and line numbers */ + if (rcd_type == DCC_CK_FLOD_PATH) { + DB_TGTS_CK_SET(rcd_ck, 0); + continue; + } + + /* Do not link or total some checksums unless they are + * whitelist entries. 
If they are whitelist entries, they + * will eventually get set to DCC_TGTS_OK or DCC_TGTS_OK2. + * Blacklist entries are noticed later by server-ID + * or do not matter DCC_TGTS_TOO_MANY. */ + if (DB_TEST_NOKEEP(db_parms.nokeep_cks, rcd_type) + && DB_RCD_ID(rcd) != DCC_ID_WHITE) { + DB_TGTS_CK_SET(rcd_ck, 1); + continue; + } + + res = (rcd_tgts == DCC_TGTS_DEL) ? 0 : rcd_tgts; + + switch (db_lookup(emsg, rcd_type, rcd_ck->sum, lo, hi, + &db_sts.hash, &db_sts.rcd2, &prev_ck)) { + case DB_FOUND_SYSERR: + return 0; + + case DB_FOUND_LATER: + continue; + + case DB_FOUND_IT: + /* We found the checksum + * Update the hash table to point to the new record */ + DB_HPTR_CP(db_sts.hash.d.h->rcd, db_sts.rcd.s.rptr); + SET_FLUSH_HE(&db_sts.hash); + /* link new record to existing record */ + rcd_ck->prev = DB_PTR_CP(db_sts.rcd2.s.rptr); + + /* delete predecessors to a delete request + * and compute the remaining sum */ + if (rcd_tgts == DCC_TGTS_DEL) { + if (!del_ck(emsg, &res, rcd, rcd_type, + prev_ck, &db_sts.rcd2)) + return 0; + /* delete requests are obsolete if the + * checksum is whitelisted */ + if (res == DCC_TGTS_OK + || res == DCC_TGTS_OK2) + rcd_ck->type_fgs |= DB_CK_FG_OBS; + break; + } + + /* Simple checksum with a predecessor + * This does not do the substantial extra work + * to notice all delete requests that arrived early. + * That problem is handled by the incoming flood + * duplicate report detection mechanism. + * We must detect precessors that were deleted because + * they are partial duplicates of the new record. 
*/ + prev_ck_tgts = DB_TGTS_CK(prev_ck); + if (DB_RCD_SUMRY(rcd)) + res = prev_ck_tgts; + else + res = db_sum_ck(prev_ck_tgts, res, rcd_type); + if ((res == DCC_TGTS_OK || res == DCC_TGTS_OK2 + || (DB_RCD_ID(db_sts.rcd2.d.r) == DCC_ID_WHITE)) + && DB_RCD_ID(rcd) != DCC_ID_WHITE){ + /* obsolete whitelisted checksums */ + rcd_ck->type_fgs |= DB_CK_FG_OBS; + break; + } + if (res == DCC_TGTS_TOO_MANY) { + /* mark obsolete unneeded reports of spam */ + if (!DB_CK_OBS(rcd_ck) + && !ck_obs_spam(emsg, rcd, rcd_tgts, + rcd_ck, rcd_type, + prev_ck, prev_ck_tgts, + &keeping_new)) + return 0; /* (broken database) */ + } else if (rcd_type == DCC_CK_SRVR_ID) { + /* mark obsolete server-ID assertions */ + if (!DB_CK_OBS(rcd_ck) + && !srvr_id_ck(emsg, rcd, rcd_ck, prev_ck)) + return 0; /* (broken database) */ + } + break; + + case DB_FOUND_EMPTY: + /* We found an empty hash table slot. + * Update the slot to point to our new record + * after removing it from the free list, + * which marks it dirty. */ + if (!unlink_free_hash(emsg, &db_sts.hash)) + return 0; + DB_HPTR_CP(db_sts.hash.d.h->rcd, db_sts.rcd.s.rptr); + HE_MERGE(db_sts.hash.d.h,rcd_type, rcd_ck->sum); + if (res >= BULK_THRESHOLD) + keeping_new = 1; + break; + + case DB_FOUND_CHAIN: + /* We found a hash collision, a chain of 1 or more + * records with the same hash value. + * Get a free slot, link it to the end of the + * existing chain, and point it to the new record. + * The buffer containing the free slot is marked + * dirty when it is removed from the free list. */ + if (!get_free_hash(emsg, db_sts.hash.s.haddr)) + return 0; + DB_HADDR_CP(db_sts.free.d.h->bak, db_sts.hash.s.haddr); + DB_HADDR_CP(db_sts.hash.d.h->fwd, db_sts.free.s.haddr); + DB_HPTR_CP(db_sts.free.d.h->rcd, db_sts.rcd.s.rptr); + HE_MERGE(db_sts.free.d.h,rcd_type, rcd_ck->sum); + SET_FLUSH_HE(&db_sts.hash); + if (res >= BULK_THRESHOLD) + keeping_new = 1; + break; + + case DB_FOUND_INTRUDER: + /* The home hash slot for our key contains an + * intruder. 
Move it to a new free slot */ + if (!get_free_hash(emsg, db_sts.hash.s.haddr)) + return 0; + *db_sts.free.d.h = *db_sts.hash.d.h; + /* re-link the neighbors of the intruder */ + haddr = DB_HADDR_EX(db_sts.free.d.h->bak); + if (haddr == DB_HADDR_NULL) { + dcc_pemsg(EX_DATAERR, emsg, + "bad hash chain reverse link at %#x" + " in %s", + haddr, db_hash_nm); + return 0; + } + if (!map_hash(emsg, haddr, &db_sts.tmp, 0)) + return 0; + DB_HADDR_CP(db_sts.tmp.d.h->fwd, db_sts.free.s.haddr); + SET_FLUSH_HE(&db_sts.tmp); + haddr = DB_HADDR_EX(db_sts.hash.d.h->fwd); + if (haddr != DB_HADDR_NULL) { + if (!map_hash(emsg, haddr, &db_sts.tmp, 0)) + return 0; + DB_HADDR_CP(db_sts.tmp.d.h->bak, + db_sts.free.s.haddr); + SET_FLUSH_HE(&db_sts.tmp); + } + /* install the new entry in its home slot */ + DB_HADDR_CP(db_sts.hash.d.h->fwd, DB_HADDR_NULL); + DB_HADDR_CP(db_sts.hash.d.h->bak, DB_HADDR_NULL); + DB_HPTR_CP(db_sts.hash.d.h->rcd, db_sts.rcd.s.rptr); + HE_MERGE(db_sts.hash.d.h,rcd_type, rcd_ck->sum); + SET_FLUSH_HE(&db_sts.hash); + if (res >= BULK_THRESHOLD) + keeping_new = 1; + break; + } + + /* Fix the checksum's total in the record */ + DB_TGTS_CK_SET(rcd_ck, res); + SET_FLUSH_RCD(&db_sts.rcd, 0); + } + + return db_set_sizes(emsg); +} + + + +/* Add a record to the database and the hash table + * The record must be known to be valid + * Use db_sts.rcd, db_sts.hash, db_sts.rcd2, db_sts.free, db_sts.tmp + * On exit db_sts.rcd points to the new record in the database */ +DB_PTR /* 0=failed */ +db_add_rcd(DCC_EMSG emsg, const DB_RCD *new_rcd) +{ + u_int new_rcd_len, pad_len; + DB_PTR new_db_csize, rcd_pos, new_page_num; + DB_BUF *b; + + if (!db_make_dirty(emsg)) + return 0; + + new_rcd_len = (sizeof(*new_rcd) + - sizeof(new_rcd->cks) + + (DB_NUM_CKS(new_rcd) * sizeof(new_rcd->cks[0]))); + + rcd_pos = db_csize; + new_db_csize = rcd_pos+new_rcd_len; + + new_page_num = DB_PTR2PG_NUM(new_db_csize, db_pagesize); + if (new_page_num == DB_PTR2PG_NUM(db_csize, db_pagesize)) { + if 
(!map_db(emsg, rcd_pos, new_rcd_len, &db_sts.rcd, 0)) + return 0; + + } else { + /* fill with zeros to get past a page boundary. */ + pad_len = new_page_num*db_pagesize - db_csize; + pad_len = (((pad_len + DB_RCD_HDR_LEN-1) / DB_RCD_HDR_LEN) + * DB_RCD_HDR_LEN); + if (pad_len != 0) { + if (!map_db(emsg, db_csize, pad_len, &db_sts.rcd, 0)) + return 0; + memset(db_sts.rcd.d.r, 0, pad_len); + db_set_flush(&db_sts.rcd, 1, pad_len); + db_csize += pad_len; + + rcd_pos = db_csize; + new_db_csize = rcd_pos+new_rcd_len; + } + + /* extend the file by writing a full page to it with write(), + * because extending by mmap() often does not work */ + db_fsize = db_csize+db_pagesize; + if (!map_db(emsg, rcd_pos, db_pagesize, &db_sts.rcd, 1)) + return 0; + b = db_sts.rcd.b; + b->flush = (DB_BUF_FM)-1; + + /* push new page to disk if dblist or dbclean is running */ + if (db_minimum_map) { + rel_db_state(&db_sts.rcd); + if (!buf_munmap(emsg, b)) + return 0; + if (!map_db(emsg, rcd_pos, new_rcd_len, &db_sts.rcd, 0)) + return 0; + } + } + + /* install the record */ + memcpy(db_sts.rcd.d.r, new_rcd, new_rcd_len); + /* Mark its buffer to be sent to the disk to keep the database + * as good as possible even if we crash. We don't need to worry + * about later changes to the hash links because dbclean will + * rebuild them if we crash */ + db_set_flush(&db_sts.rcd, 1, new_rcd_len); + db_csize = new_db_csize; + + /* install pointers in the hash table + * and update the total counts in the record */ + if (!db_link_rcd(emsg, 0, MAX_HASH_ENTRIES)) + return 0; + + ++db_stats.adds; + return rcd_pos; +}