view srvrlib/db.c @ 4:d329bb5c36d0

Changes making it compile the new upstream release
author Peter Gervai <grin@grin.hu>
date Tue, 10 Mar 2009 14:57:12 +0100
parents c7f6b056b673
children
line wrap: on
line source

/* Distributed Checksum Clearinghouse
 *
 * server database functions
 *
 * Copyright (c) 2008 by Rhyolite Software, LLC
 *
 * This agreement is not applicable to any entity which sells anti-spam
 * solutions to others or provides an anti-spam solution as part of a
 * security solution sold to other entities, or to a private network
 * which employs the DCC or uses data provided by operation of the DCC
 * but does not provide corresponding data to other users.
 *
 * Permission to use, copy, modify, and distribute this software without
 * changes for any purpose with or without fee is hereby granted, provided
 * that the above copyright notice and this permission notice appear in all
 * copies and any distributed versions or copies are either unchanged
 * or not called anything similar to "DCC" or "Distributed Checksum
 * Clearinghouse".
 *
 * Parties not eligible to receive a license under this agreement can
 * obtain a commercial license to use DCC by contacting Rhyolite Software
 * at sales@rhyolite.com.
 *
 * A commercial license would be for Distributed Checksum and Reputation
 * Clearinghouse software.  That software includes additional features.  This
 * free license for Distributed ChecksumClearinghouse Software does not in any
 * way grant permision to use Distributed Checksum and Reputation Clearinghouse
 * software
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND RHYOLITE SOFTWARE, LLC DISCLAIMS ALL
 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL RHYOLITE SOFTWARE, LLC
 * BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
 * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Rhyolite Software DCC 1.3.103-1.214 $Revision$
 */

#include "srvr_defs.h"
#include <syslog.h>
#include <sys/resource.h>
#if defined(HAVE_HW_PHYSMEM) || defined(HAVE_BOOTTIME)
#include <sys/sysctl.h>
#endif
#ifdef HAVE_PSTAT_GETSTATIC /* HP-UX */
#include <sys/pstat.h>
#endif

/* File-scope state of the database layer. */
DB_STATS db_stats;

/* the set of DB_STATE cursors; rel_db_states() walks it from .rcd
 * through .hash_ctl */
DB_STATES db_sts;

DCC_PATH db_path_buf;

/* descriptors and names of the data file and of the hash table file;
 * db_close() sets each descriptor back to -1 after closing it */
int db_fd = -1;
DCC_PATH db_nm;
int db_hash_fd = -1;
DCC_PATH db_hash_nm;
/* db_lock() copies db_time here; db_unlock() and db_close() zero tv_sec */
struct timeval db_locked;		/* 1=database not locked */

/* the caller's notion of the current time, used for the flush timers */
struct timeval db_time;

int db_debug;

u_char grey_on;
static u_char db_use_write;		/* 0=no 1=if RAM big enough 2=always */
/* set by db_make_dirty() and cleared by make_clean() */
static u_char db_dirty;
static u_char db_rdonly;
/* only the first failure is remembered by db_failure() and db_error_msg() */
int db_failed_line;			/* bad happened at this line # */
const char *db_failed_file;		/*	in this file */
static u_char db_invalidate;		/* do not write to the files */

/* Without mmap(MAP_NOSYNC) as on Solaris or a good msync() as on BSD/OS,
 * we must rely on the kernel's update/syncer/bufdaemon/etc.  So in this
 * case just fondle the mmap()'ed pages and hope things work out.
 *
 * With a msync() and with mmap(MAP_NOSYNC), use MAP_NOSYNC if we can because
 * some systems flush too quickly while others such as FreeBSD 6.1 stall
 * for seconds while thinking about flushing the database.
 * But with mmap(MAP_NOSYNC) we leave large amounts of data in RAM that take
 * too long to be pushed to the disk when the system is shutting down.
 * So
 *	- hit only those chunks of memory with real data or changes to data
 *	    with msync().  Trust dbclean to rebuild everything else at need.
 *
 *	- when it seems the system is being shut down, delete the hash table
 *	    and let it be rebuilt when the system is rebooted.  When the
 *	    hash table is rebuilt, "obsolete" markings in the data file that
 *	    might have been lost will be remade.
 *
 * A third case involves dccd -F.  It requires that all changes be pushed to
 * the disk whenever dccd unlocks the database so that dbclean can see changes
 * dccd makes.  It also requires that dbclean write all of its changes so
 * that dccd will find them when it reopens the database.
 */

/* Use mmap(MAP_NOSYNC) only when the system defines MAP_NOSYNC, does not
 * have the old msync(), and HAVE_BOOTTIME is defined. */
#if !defined(MAP_NOSYNC) || defined(HAVE_OLD_MSYNC) || !defined(HAVE_BOOTTIME)
#undef USE_MAP_NOSYNC
#else
#define USE_MAP_NOSYNC
#endif

/* set by db_make_dirty() when USE_MAP_NOSYNC is defined and
 * cleared by make_clean(2) after a successful fsync() of the hash table */
static u_char db_not_synced;		/* database unsynchronized with disk */


/* Wrappers for the madvise() hints applied to one db_pagesize buffer.
 *	Each wrapper defaults to the constant 0 when the system offers
 *	neither form of the hint.  When both madvise() and posix_madvise()
 *	forms are defined, the posix_madvise() form is the one that is kept
 *	because it is defined last. */
#define DCC_MADV_WILLNEED(p) 0
#ifdef MADV_WILLNEED
#undef DCC_MADV_WILLNEED
#define DCC_MADV_WILLNEED(p) madvise(p, db_pagesize, MADV_WILLNEED)
#endif
#ifdef POSIX_MADV_WILLNEED
#undef DCC_MADV_WILLNEED
#define DCC_MADV_WILLNEED(p) posix_madvise(p, db_pagesize, POSIX_MADV_WILLNEED)
#endif

#define DCC_MADV_RANDOM(p) 0
#ifdef MADV_RANDOM
#undef DCC_MADV_RANDOM
#define DCC_MADV_RANDOM(p) madvise(p, db_pagesize, MADV_RANDOM)
#endif
#ifdef POSIX_MADV_RANDOM
#undef DCC_MADV_RANDOM
#define DCC_MADV_RANDOM(p) posix_madvise(p, db_pagesize, POSIX_MADV_RANDOM)
#endif

#define DCC_MADV_DONTNEED(p) 0
/* The Linux people claim that it is just fine that their notion of
 * MADV_DONTNEED implies discarding changes to data.  Worse, some versions of
 * Linux/GNU libc define POSIX_MADV_DONTNEED as the data-corrupting Linux
 * MADV_DONTNEED.  This seems to be because they cannot admit their mistake of
 * not distinguishing between the functions of MADV_FREE and MADV_DONTNEED and
 * their misreading of other systems' documentation for MADV_DONTNEED */
/* so on Linux DCC_MADV_DONTNEED() always stays the constant 0 */
#ifndef linux
#ifdef MADV_DONTNEED
#undef DCC_MADV_DONTNEED
#define DCC_MADV_DONTNEED(p) madvise(p, db_pagesize, MADV_DONTNEED)
#endif
#ifdef POSIX_MADV_DONTNEED
#undef DCC_MADV_DONTNEED
#define DCC_MADV_DONTNEED(p) posix_madvise(p, db_pagesize, POSIX_MADV_DONTNEED)
#endif
#endif /* !linux */

#define DCC_MADV_FREE(p) 0
#ifdef MADV_FREE
#undef DCC_MADV_FREE
#define DCC_MADV_FREE(p) madvise(p, db_pagesize, MADV_FREE)
#endif
#ifdef POSIX_MADV_FREE
#undef DCC_MADV_FREE
#define DCC_MADV_FREE(p) posix_madvise(p, db_pagesize, POSIX_MADV_FREE)
#endif


u_char db_minimum_map;			/* this is dccd & dbclean is running */

int db_buf_total;			/* total # of db buffers */
DB_PTR db_max_rss;			/* maximum db resident set size */
DB_PTR db_max_byte;			/* maximum db bytes in both files */

static u_int system_pagesize;		/* kernel page size */

static DB_BUF db_bufs[DB_BUF_MAX];	/* control mmap()'ed blocks */
/* ends of the list of buffers that is walked with ->newer from buf_oldest
 * and with ->older from buf_newest */
static DB_BUF *buf_oldest, *buf_newest;

#define DB_HASH_TOTAL DB_BUF_MAX
static DB_BUF *db_buf_hash[DB_HASH_TOTAL];
/* fancy 16-bit multiplicative hash assumes multiplication needs 1 cycle
 * and so the hash is faster than dealing with a collision */
#define DB_BUF_HASH(pnum,t) (&db_buf_hash[((((pnum)*(t)*0x9ccf) & 0xffff)   \
					   * DB_BUF_MAX) >> 16])

/* Times, in the seconds of db_time, when db_flush_needed() has work to do.
 *	0 means that nothing is pending. */
time_t db_need_flush_secs;
static time_t db_urgent_need_flush_secs;

const DB_VERSION_BUF db_version_buf = DB_VERSION_STR;
DB_PARMS db_parms;
static DB_PARMS db_parms_stored;

/* set for each checksum type by set_db_tholds() */
DCC_TGTS db_tholds[DCC_DIM_CKS];

u_int db_pagesize;			/* size of 1 mmap()'ed buffer */
/* size of one separately flushed part of a buffer; see db_set_flush() */
static u_int db_pagesize_part;

DB_HOFF db_hash_fsize;			/* size of hash table file */
/* next hash table page that db_flush_db() will try to push to the disk */
static u_int hash_clear_pg_num;
DB_HADDR db_hash_len;			/* # of hash table entries */
DB_HADDR db_hash_divisor;		/* modulus */
DB_HADDR db_hash_used;			/* # of hash table entries in use */
u_int db_hash_page_len;			/* # of HASH_ENTRY's per buffer */
DB_HADDR db_max_hash_entries = 0;	/* after db_buf_init()*/
DB_PTR db_fsize;				/* size of database file */
DB_PTR db_csize;			/* size of database contents in bytes */
static DB_PTR db_csize_stored_hash;	/* DB size stored in hash file */
static DB_HADDR db_hash_used_stored_hash;
u_int db_page_max;			/* only padding after this in DB buf */
static DB_PTR db_window_size;		/* size of mmap() window */
char db_window_size_str[128];
static char db_physmem_str[80];

/* Fuzziness level of each checksum type for an ordinary DCC database.
 *	The entries are in the order of the checksum types named in
 *	the comments. */
static const u_char dcc_ck_fuzziness[DCC_DIM_CKS] = {
	0,				/* DCC_CK_INVALID */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_IP */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_ENV_FROM */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_FROM */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_SUB */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_MESSAGE_ID */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_RECEIVED */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_BODY */
	DCC_CK_FUZ_LVL1,		/* DCC_CK_FUZ1 */
	DCC_CK_FUZ_LVL2,		/* DCC_CK_FUZ2 */
	DCC_CK_FUZ_LVL_REP,		/* DCC_CK_REP_TOTAL */
	DCC_CK_FUZ_LVL_REP,		/* DCC_CK_REP_BULK */
	DCC_CK_FUZ_LVL2,		/* DCC_CK_SRVR_ID */
	DCC_CK_FUZ_LVL2			/* DCC_CK_ENV_TO */
};
/* The same table for a greylist database, where the two slots used for
 *	reputation checksums above are named DCC_CK_GREY_MSG and
 *	DCC_CK_GREY_TRIPLE. */
static const u_char grey_ck_fuzziness[DCC_DIM_CKS] = {
	0,				/* DCC_CK_INVALID */
	DCC_CK_FUZ_LVL2,		/* DCC_CK_IP */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_ENV_FROM */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_FROM */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_SUB */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_MESSAGE_ID */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_RECEIVED */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_BODY */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_FUZ1 */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_FUZ2 */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_GREY_MSG */
	DCC_CK_FUZ_LVL1,		/* DCC_CK_GREY_TRIPLE */
	DCC_CK_FUZ_LVL1,		/* DCC_CK_SRVR_ID */
	DCC_CK_FUZ_LVL1			/* DCC_CK_ENV_TO */
};
/* the table in use, which starts as the ordinary DCC table */
const u_char *db_ck_fuzziness = dcc_ck_fuzziness;


/* forward declarations of file-local helpers that are used
 * before they are defined */
static u_char buf_flush(DCC_EMSG, DB_BUF *, u_char);
static u_char buf_munmap(DCC_EMSG, DB_BUF *);
static DB_BUF *find_buf(DCC_EMSG, DB_BUF_TYPE, DB_PG_NUM);
static u_char map_hash(DCC_EMSG, DB_HADDR, DB_STATE *, u_char);
static u_char map_hash_ctl(DCC_EMSG, u_char);
static u_char map_db(DCC_EMSG, DB_PTR, u_int, DB_STATE *, u_char);
static u_char db_set_sizes(DCC_EMSG);


/* Compute the least common multiple of two numbers.
 *	The result is 0 if either number is 0, instead of a division by
 *	zero while looking for the greatest common divisor. */
static u_int
lcm(u_int n, u_int m)
{
	u_int r, x, gcd;

	/* Euclid's algorithm below would compute x % 0 */
	if (n == 0 || m == 0)
		return 0;

	/* first get the gcd of the two numbers */
	if (n >= m) {
		x = n;
		gcd = m;
	} else {
		x = m;
		gcd = n;
	}
	for (;;) {
		r = x % gcd;
		if (r == 0) {
			/* divide before multiplying to put off overflow */
			return n * (m / gcd);
		}
		x = gcd;
		gcd = r;
	}
}



/* Convert a database size or offset to a string for messages.
 *	A value that is an exact multiple of 1024, 1024*1024, or
 *	1024*1024*1024 is scaled and given a "KB", "MB", or "GB" suffix.
 *	Commas separate groups of three digits in what remains once it
 *	reaches 10,000.
 *
 *	The result is "0" or one of four static buffers used in rotation,
 *	so at most four results can be in use at once, as in the arguments
 *	of a single printf(). */
const char *
db_ptr2str(DB_PTR val)
{
	static int bufno;
	static struct {
	    char    str[16];
	} bufs[4];
	char *s;
	const char *units;

	if (val == 0)
		return "0";

	s = bufs[bufno].str;
	bufno = (bufno+1) % DIM(bufs);

	if (val % (1024*1024*1024) == 0) {
		val /= (1024*1024*1024);
		units = "GB";
	} else if (val % (1024*1024) == 0) {
		val /= (1024*1024);
		units = "MB";
	} else if (val % 1024 == 0) {
		val /= 1024;
		units = "KB";
	} else {
		units = "";
	}

	/* The thresholds are inclusive so that exact powers of 1000 are not
	 * printed with a four digit leading group such as "1000,000".
	 * Each group is reduced modulo 1000 before it is narrowed to an int
	 * so that a large 64-bit quotient is not truncated first. */
	if (val >= 1000*1000*1000)
		snprintf(s, sizeof(bufs[0].str), "%d,%03d,%03d,%03d%s",
			 (int)(val / (1000*1000*1000)),
			 (int)((val / (1000*1000)) % 1000),
			 (int)((val / 1000) % 1000),
			 (int)(val % 1000),
			 units);
	else if (val >= 1000*1000)
		snprintf(s, sizeof(bufs[0].str), "%d,%03d,%03d%s",
			 (int)(val / (1000*1000)),
			 (int)((val / 1000) % 1000),
			 (int)(val % 1000),
			 units);
	else if (val >= 1000*10)
		snprintf(s, sizeof(bufs[0].str), "%d,%03d%s",
			 (int)(val / 1000),
			 (int)(val % 1000),
			 units);
	else
		snprintf(s, sizeof(bufs[0].str), "%d%s",
			 (int)val,
			 units);
	return s;
}



/* Convert a count or a number of bytes to a short string such as
 *	"1.5K", "150K", or "2G".
 *	    buf, buf_len	where to put the string
 *	    num			the value to convert
 *	    bytes_or_entries	0=scale by 1000, 1=scale by 1024
 *	Returns buf. */
const char *
size2str(char *buf, u_int buf_len,
	 double num, u_char bytes_or_entries)	/* 0=number 1=bytes */
{
	const char *units;
	double k;

	k = bytes_or_entries ? 1024.0 : 1000.0;

	if (num < k) {
		units = "";
	} else if (num < k*k) {
		num /= k;
		units = "K";
	} else if (num < k*k*k) {
		num /= k*k;
		units = "M";
	} else {
		num /= k*k*k;
		units = "G";
	}

	/* Values that round to 100 or more get no fraction.
	 * The limit is 99.5 instead of 100 because "%.2g" rounds 99.5 and
	 * above to three digits and so prints "1e+02".
	 * Comparing as a double also avoids converting a huge value to
	 * an int. */
	if (num >= 99.5)
		snprintf(buf, buf_len, "%.0f%s", num, units);
	else
		snprintf(buf, buf_len, "%.2g%s", num, units);
	return buf;
}



/* Note where the first database problem was detected and then hand
 *	the printf-style message to dcc_vpemsg() along with the exit
 *	code and the error message buffer. */
void PATTRIB(5,6)
db_failure(int linenum, const char *file, int ex_code, DCC_EMSG emsg,
	   const char *p, ...)
{
	va_list ap;

	/* later failures do not replace the first location */
	if (db_failed_line == 0) {
		db_failed_file = file;
		db_failed_line = linenum;
	}

	va_start(ap, p);
	dcc_vpemsg(ex_code, emsg, p, ap);
	va_end(ap);
}



/* Note where the first database problem was detected and then hand
 *	the printf-style message to dcc_verror_msg(). */
void PATTRIB(3,4)
db_error_msg(int linenum, const char *file, const char *p, ...)
{
	va_list ap;

	/* later failures do not replace the first location */
	if (db_failed_line == 0) {
		db_failed_file = file;
		db_failed_line = linenum;
	}

	va_start(ap, p);
	dcc_verror_msg(p, ap);
	va_end(ap);
}



/* Compute the rate at which the hash table or the data file has grown.
 *	The accumulated totals in the parameters are combined with the
 *	growth since the time in parms->sn.
 *	Returns -1.0 when not more than DB_MIN_RATE_SECS are covered. */
double					/* hashes or bytes/second */
db_add_rate(const DB_PARMS *parms,
	    u_char hash_or_db)		/* 1=hash */
{
	struct timeval sn;
	time_t recent_secs;
	time_t secs;
	double sum, now, before;

	secs = parms->rate_secs;
	if (!hash_or_db) {
		sum = parms->db_added;
		now = parms->db_csize;
		before = parms->old_db_csize;
	} else {
		sum = parms->hash_added;
		now = parms->hash_used;
		before = parms->old_hash_used;
	}

	/* ignore accumulated totals that are not plausible */
	if (secs <= 0 || secs > DB_MAX_RATE_SECS
	    || sum <= 0.0) {
		secs = 0;
		sum = 0.0;
	}

	/* include the most recent period if it is plausible */
	dcc_ts2timeval(&sn, &parms->sn);
	recent_secs = parms->last_rate_sec - sn.tv_sec;
	if (recent_secs > 0 && recent_secs <= DB_MAX_RATE_SECS
	    && now > before) {
		sum += now - before;
		secs += recent_secs;
	}

	return (secs <= DB_MIN_RATE_SECS) ? -1.0 : sum / secs;
}



/* Build the default set of checksum types that are not kept.
 *	It contains every type for which DB_GLOBAL_NOKEEP() is true with
 *	the current grey_on setting, plus DCC_CK_INVALID and
 *	DCC_CK_FLOD_PATH. */
DB_NOKEEP_CKS
def_nokeep_cks(void)
{
	DB_NOKEEP_CKS mask = 0;
	DCC_CK_TYPES t;

	t = DCC_CK_TYPE_FIRST;
	while (t <= DCC_CK_TYPE_LAST) {
		if (DB_GLOBAL_NOKEEP(grey_on, t))
			DB_SET_NOKEEP(mask, t);
		++t;
	}

	/* these two are never kept */
	DB_SET_NOKEEP(mask, DCC_CK_INVALID);
	DB_SET_NOKEEP(mask, DCC_CK_FLOD_PATH);

	return mask;
}



/* Fill db_tholds[] with the threshold of each checksum type.
 *	Types in the nokeep set and the types selected by
 *	DCC_CK_IS_REP_CMN() get DCC_TGTS_INVALID.  Every other type gets 1
 *	when greylisting, and otherwise 1 for DCC_CK_SRVR_ID and
 *	BULK_THRESHOLD for the rest. */
void
set_db_tholds(DB_NOKEEP_CKS nokeep)
{
	DCC_CK_TYPES type;
	DCC_TGTS thold;

	for (type = 0; type < DIM(db_tholds); ++type) {
		if (DB_TEST_NOKEEP(nokeep, type))
			thold = DCC_TGTS_INVALID;
		else if (DCC_CK_IS_REP_CMN(grey_on, type))
			thold = DCC_TGTS_INVALID;
		else if (grey_on)
			thold = 1;
		else if (type == DCC_CK_SRVR_ID)
			thold = 1;
		else
			thold = BULK_THRESHOLD;

		db_tholds[type] = thold;
	}
}



/* Get the name of the file behind a buffer for messages.
 *	Any buffer type other than hash table or data is reported
 *	with dcc_logbad(). */
static const char *
buf2path(const DB_BUF *b)
{
	if (b->buf_type == DB_BUF_TYPE_HASH)
		return db_hash_nm;
	if (b->buf_type == DB_BUF_TYPE_DB)
		return db_nm;

	/* DB_BUF_TYPE_FREE or garbage */
	dcc_logbad(EX_SOFTWARE, "impossible buffer type for a path");
}



/* Get the file descriptor of the file behind a buffer.
 *	Any buffer type other than hash table or data is reported
 *	with dcc_logbad(). */
static int
buf2fd(const DB_BUF *b)
{
	if (b->buf_type == DB_BUF_TYPE_HASH)
		return db_hash_fd;
	if (b->buf_type == DB_BUF_TYPE_DB)
		return db_fd;

	/* DB_BUF_TYPE_FREE or garbage */
	dcc_logbad(EX_SOFTWARE, "impossible buffer type for fd");
}



/* Detach a state from its buffer, if it has one, and
 *	give up its lock on that buffer. */
static void
rel_db_state(DB_STATE *st)
{
	DB_BUF *held = st->b;

	if (held == 0)
		return;

	/* make the state point at nothing */
	st->s.rptr = DB_PTR_BAD;
	st->d.v = 0;
	st->b = 0;

	held->lock_cnt -= 1;
	if (held->lock_cnt < 0)
		dcc_logbad(EX_SOFTWARE,"negative database buffer lock");
}



/* Release all of the states in db_sts, from db_sts.rcd
 *	through db_sts.hash_ctl. */
void
rel_db_states(void)
{
	DB_STATE *st = &db_sts.rcd;

	while (st <= &db_sts.hash_ctl)
		rel_db_state(st++);
}



/* Release one or all of the buffers that are in use but not locked.
 *	The buffers are visited from the oldest to the newest.  When the
 *	caller is finished (some=2), buffers that are not
 *	DB_BUF_FG_USE_WRITE are first given to DCC_MADV_DONTNEED().
 *	Only the first failure is reported through emsg. */
u_char					/* 0=problem 1=did nothing 2=did>=1 */
db_unload(DCC_EMSG emsg,
	  u_char some)			/* 0=all, 1=only one, 2=finished */
{
	DB_BUF *cur;
	u_char status = 1;

	for (cur = buf_oldest; cur != 0; cur = cur->newer) {
		/* leave unused and locked buffers alone */
		if (cur->lock_cnt != 0
		    || cur->buf_type == DB_BUF_TYPE_FREE)
			continue;

		if (some == 2
		    && !(cur->flags & DB_BUF_FG_USE_WRITE)) {
			if (0 > DCC_MADV_DONTNEED(cur->buf.v))
				dcc_error_msg("madvise(DONTNEED %s,%#x): %s",
					      buf2path(cur), db_pagesize,
					      ERROR_STR());
		}

		if (buf_munmap(emsg, cur)) {
			/* remember success unless something has failed */
			if (status != 0)
				status = 2;
		} else {
			emsg = 0;
			status = 0;
		}

		if (some == 1)
			break;
	}

	return status;
}



/* Write bytes to the file behind a buffer with lseek() and write().
 *	    offset	position relative to the start of the buffer's page
 *	    buf, len	the bytes to write
 *	Returns 1 if all of the bytes were written and 0 after
 *	reporting a failure with db_failure(). */
static u_char
buf_write_part(DCC_EMSG emsg, DB_BUF *b, off_t offset, void *buf, int len)
{
	int fd, written;

	fd = buf2fd(b);

	/* convert to a position in the file */
	offset += (off_t)b->pg_num * (off_t)db_pagesize;

	if (lseek(fd, offset, SEEK_SET) != offset) {
		db_failure(__LINE__,__FILE__, EX_IOERR, emsg,
			   "buf_write_part lseek(%s,"OFF_HPAT"): %s",
			   buf2path(b), offset, ERROR_STR());
		return 0;
	}

	written = write(fd, buf, len);
	if (written == len)
		return 1;

	db_failure(__LINE__,__FILE__, EX_IOERR, emsg,
		   "buf_write_part(%s,%u)=%d: %s",
		   buf2path(b), len, written, ERROR_STR());
	return 0;
}



/* push part of a buffer toward the disk
 *	this can be needed even when the file has been opened and mapped
 *	read-only by dbclean
 *
 *	Nothing is done if neither the ordinary nor the urgent dirty bit of
 *	the part is set.  The dirty bits of the part are cleared before the
 *	data is written, even if writing then fails.
 *	A buffer that extends the file (DB_BUF_FG_EXTENSION) is written
 *	entirely with write(), after any other new buffers of the same file
 *	with smaller page numbers.
 *	Otherwise the dirty range of the part is sent with write() for
 *	DB_BUF_FG_USE_WRITE buffers and with msync() for the others.
 *	Nothing is written when db_invalidate is set.
 *
 *	Returns 1 if ok and 0 after reporting a failure with db_failure(). */
static u_char
buf_flush_part(DCC_EMSG emsg, DB_BUF *b,
	       u_int part,		/* DB_BUF_NUM_PARTS=buffer */
	       u_char async UATTRIB)
{
	u_int flush_len;
	char *flush_base;
	DB_BUF_FM bit;

	bit = PART2BIT(part) & (b->flush | b->flush_urgent);
	if (!bit)
		return 1;

	/* Send a new buffer to disk at once. */
	if (b->flags & DB_BUF_FG_EXTENSION) {
		DB_BUF *b1, *b0;
		u_char result;

		/* To give the file system a chance to make the hash table
		 * contiguous, first write all preceding new buffers.
		 * In almost all cases, there will be none. */
		result = 1;
		do {
			/* find the new buffer with the smallest page number,
			 * which is finally this buffer itself */
			b0 = b;
			for (b1 = buf_oldest; b1 != 0; b1 = b1->newer) {
				if (!(b1->flags & DB_BUF_FG_EXTENSION)
				    || b1->buf_type != b0->buf_type
				    || b1->pg_num >= b0->pg_num)
					continue;
				b0 = b1;
			}
			b0->flags &= ~DB_BUF_FG_EXTENSION;
			b0->flush = 0;
			b0->flush_urgent = 0;
			if (!db_invalidate
			    && !buf_write_part(emsg, b0,
					       0, b0->buf.c, db_pagesize))
				result = 0;
		} while (b0 != b);
		return result;
	}

	flush_base = b->ranges[part].lo;
	flush_len = b->ranges[part].hi - flush_base;
	b->flush &= ~bit;
	b->flush_urgent &= ~bit;

	if (db_invalidate)
		return 1;

	if (b->flags & DB_BUF_FG_USE_WRITE) {
		static char *wbuf;
		static u_int wbuf_len;

		/* In at least FreeBSD you cannot write() to the file
		 * that underlies a mmap() region from that region */
		if (wbuf_len < db_pagesize_part) {
			/* the page size for the current file
			 * might be different from the old file */
			free(wbuf);
			wbuf_len = 0;
			wbuf = malloc(db_pagesize_part);
			if (!wbuf) {
				/* wbuf_len stays 0 so that the next call
				 * tries to allocate again instead of
				 * copying into a null pointer */
				db_failure(__LINE__,__FILE__, EX_IOERR, emsg,
					   "malloc(%u) to flush %s failed",
					   db_pagesize_part, buf2path(b));
				return 0;
			}
			wbuf_len = db_pagesize_part;
		}

		memcpy(wbuf, flush_base, flush_len);
		return buf_write_part(emsg, b, flush_base - b->buf.c,
				      wbuf, flush_len);

#ifndef HAVE_OLD_MSYNC
	} else if (async) {
		if (0 > MSYNC(flush_base, flush_len, MS_ASYNC)) {
			db_failure(__LINE__,__FILE__, EX_IOERR, emsg,
				   "msync(db buffer %s,%#lx,%#x,MS_ASYNC): %s",
				   buf2path(b), (long)flush_base, flush_len,
				   ERROR_STR());
			return 0;
		}
#endif
	} else {
		if (0 > MSYNC(flush_base, flush_len, MS_SYNC)) {
			db_failure(__LINE__,__FILE__, EX_IOERR, emsg,
				   "msync(db buffer %s,%#lx,%#x,MS_SYNC): %s",
				   buf2path(b), (long)flush_base, flush_len,
				   ERROR_STR());
			return 0;
		}
	}

	return 1;
}



/* Flush every dirty part of a buffer, both ordinary and urgent.
 *	Only the first failure is reported through emsg.
 *	Returns 1 if every part was flushed and 0 otherwise. */
static u_char
buf_flush(DCC_EMSG emsg, DB_BUF *b, u_char async)
{
	u_char ok = 1;
	u_int part = 0;
	DB_BUF_FM pending = b->flush_urgent | b->flush;

	while (pending != 0 && part < DB_BUF_NUM_PARTS) {
		if (pending & PART2BIT(part)) {
			if (!buf_flush_part(emsg, b, part, async)) {
				emsg = 0;
				ok = 0;
			}
			/* flushing one part can clear other bits */
			pending = b->flush_urgent | b->flush;
		}
		++part;
	}
	return ok;
}



/* Try to keep the data clean so that the fsync() required by Solaris
 *	when the file is unloaded is not too expensive.
 *	Try to flush frequently so that we don't stall as long in msync().
 *
 *	There are two steps:
 *	  1. If the urgent timer has expired, every urgent dirty part of
 *	     every buffer is flushed asynchronously.
 *	  2. Unless mmap(MAP_NOSYNC) is in use without write(), ordinary
 *	     dirty parts are flushed until DB_PARTS_PER_FLUSH of them
 *	     have been done, resuming where the previous call stopped.
 *	db_need_flush_secs is left saying when to call here again.
 */
void
db_flush_needed(void)
{
	/* where step 2 resumes on the next call */
	static DB_BUF *next_b = db_bufs;
	static u_int next_part;
	DB_BUF *b;
	u_int part, all_parts;
	int buf_num;
	u_char worked;

	/* send to the disk changes that cannot be recreated by dbclean */
	if (db_urgent_need_flush_secs != 0
	    && DB_IS_TIME(db_urgent_need_flush_secs,
			  DB_URGENT_NEED_FLUSH_SECS)) {
		worked = 0;
		for (b = buf_newest; b; b = b->older) {
			if (b->buf_type == DB_BUF_TYPE_FREE)
				continue;

			/* stop early when no urgent bits remain */
			for (part = 0;
			     b->flush_urgent != 0 && part < DB_BUF_NUM_PARTS;
			     ++part) {
				if ((b->flush_urgent & PART2BIT(part))) {
					buf_flush_part(0, b, part, 1);
					worked = 1;
				}
			}

			/* Switch new data pages to mmap()
			 * when this is not dbclean, since only dccd calls here
			 *	they are not using mmap()
			 *	they are either hash table pages or
			 *	    not the last page in the file */
			if ((b->flags & DB_BUF_FG_USE_WRITE)
			    && !db_use_write
			    && (b->buf_type != DB_BUF_TYPE_DB
				|| (DB_PTR2PG_NUM(db_csize-1, db_pagesize)
				    != b->pg_num))) {
				/* a locked buffer is first released by
				 * releasing all of the states */
				if (b->lock_cnt != 0)
					rel_db_states();
				buf_munmap(0, b);
			}
		}

		/* Keep the clock running if we did any work. This tends to
		 * avoid stalls caused by colliding with the FreeBSD syncer */
		if (worked) {
			gettimeofday(&db_time, 0);
			db_urgent_need_flush_secs = (db_time.tv_sec
						+ DB_URGENT_NEED_FLUSH_SECS);
		} else {
			db_urgent_need_flush_secs = 0;
		}
	}

	/* assume there will be nothing more to do */
	db_need_flush_secs = db_urgent_need_flush_secs;

#ifdef USE_MAP_NOSYNC
	/* if we are using mmap(MAP_NOSYNC), then there are no bits
	 * set in any b->flush words except that of the recent
	 * DB_BUF_FG_USE_WRITE extensions of the file.  It is best to let
	 * those blocks stay in RAM until the whole buffer is flushed and
	 * switched to mmap above */
	if (!db_use_write)
		return;
#endif

	b = next_b;
	part = next_part;
	all_parts =  DB_PARTS_PER_FLUSH;
	/* Each pass either skips to the next buffer or scans the remaining
	 * parts of the current buffer, so the number of passes is bounded
	 * by the number of buffers plus one. */
	for (buf_num = DIM(db_bufs); buf_num >= 0; --buf_num) {
		/* wrap around to the first buffer */
		if (b > LAST(db_bufs)) {
			part = 0;
			b = db_bufs;
		}
		/* go to the next buffer if this one is clean, finished,
		 * or unused */
		if (!b->flush
		    || part >= DB_BUF_NUM_PARTS
		    || b->buf_type == DB_BUF_TYPE_FREE) {
			part = 0;
			++b;
			continue;
		}

		while (part < DB_BUF_NUM_PARTS) {
			if (b->flush & PART2BIT(part)) {
				buf_flush_part(0, b, part, 1);
				if (--all_parts == 0) {
					/* enough for now; remember where to
					 * resume and ask to be called again */
					next_part = part+1;
					next_b = b;
					db_need_flush_secs = (db_time.tv_sec
							+ DB_NEED_FLUSH_SECS);
					return;
				}
				/* nothing else is dirty in this buffer */
				if (!b->flush)
					part = DB_BUF_NUM_PARTS;
			}
			++part;
		}
	}
}



/* occasionally flush an unlocked data buffer for dbclean
 *	dbclean mostly changes only the current record, so get started
 *	writing the data to avoid stalling the system at the end.
 *
 *	This does nothing and returns 1 unless USE_MAP_NOSYNC is defined.
 *	Otherwise it returns the result of unmapping one new hash table
 *	page, or 0 if flushing an ordinary buffer fails, or 1. */
u_char
db_flush_db(DCC_EMSG emsg UATTRIB)
{
#ifdef USE_MAP_NOSYNC
	DB_BUF *b;
	int limit;
	int pg_num;

	/* Gently push the new hash table to disk.
	 * The disk image will never be accurate.  This only allocates space.
	 * Do not do this for systems that lack mmap(NOSYNC) such as Linux
	 * that thrash themselves as the hash table is being built.  A
	 * long pause when the database is closed is not as bad as spending
	 * hours building the hash table. */
	/* NOTE(review): the limit divides the size of the hash file by
	 * db_hash_page_len, which is described as a number of entries
	 * instead of bytes.  Confirm that db_pagesize was not intended. */
	while (hash_clear_pg_num < db_hash_fsize/db_hash_page_len) {
		pg_num = hash_clear_pg_num++;
		for (b = buf_oldest; b != 0; b = b->newer) {
			if (b->pg_num != pg_num
			    || b->buf_type != DB_BUF_TYPE_HASH)
				continue;
			/* the page is mapped but is not new */
			if (!(b->flags & DB_BUF_FG_EXTENSION))
				break;
			/* unmap one new page per call */
			if (b->lock_cnt != 0)
				rel_db_states();
			return buf_munmap(emsg, b);
		}

		/* look for the next page if this one has already
		 * been flushed */
	}

	/* flush some ordinary buffers */
	/* that is, at most 2 unlocked buffers with urgent changes,
	 * asynchronously and starting with the oldest */
	limit = 2;
	for (b = buf_oldest; b != 0; b = b->newer) {
		if (b->flush_urgent == 0
		    || b->buf_type == DB_BUF_TYPE_FREE
		    || b->lock_cnt != 0)
			continue;
		if (!buf_flush(emsg, b, 1))
			return 0;
		if (--limit <= 0)
			return 1;
	}
#endif
	return 1;
}



/* mark part of a buffer dirty
 *	"Urgent" changes are flushed by a timer.  Ordinary changes
 *	are often ignored and expected to be rebuilt if the system crashes.
 *	That the hash table is deleted as the system is shut down while the
 *	database must be flushed from the system's buffer cache is a reason
 *	to keep the disk image of the database good.
 *
 *	    st		state whose buffer st->b contains the change and
 *			    whose pointer st->d.c is the start of the change
 *	    urgent	0=ordinary change, otherwise urgent
 *	    len		length of the change in bytes
 */
void
db_set_flush(DB_STATE *st, u_char urgent, u_int len)
{
	DB_BUF *b;
	DB_BUF_FM bit, new_bits, old_bits;
	char *buf_base, *part_end, *start, *end;
	u_int part, i;

	/* nothing to do if the kernel is handling it
	 * or if we are letting this change be reconstructed by dbclean */
	b = st->b;
	if (!(b->flags & DB_BUF_FG_USE_WRITE)) {
		/* Without USE_MAP_NOSYNC this always returns.
		 * With it, only ordinary changes are ignored. */
#ifdef USE_MAP_NOSYNC
		if (!urgent)
#endif
			return;
	}

	start = st->d.c;
	buf_base = b->buf.c;

	/* Increase to even pages in the hope that the file system might
	 * be able to page-flip.  This might at least avoid reading into the
	 * buffer cache to honor a write(). Besides, Solaris' msync() handles
	 * only even pages. */
	i = (start - buf_base) % system_pagesize;
	start -= i;
	len += i;
	len = ((len + system_pagesize-1) / system_pagesize) * system_pagesize;

	end = start + len;
	if (end > buf_base+db_pagesize)
		dcc_logbad(EX_SOFTWARE, "inflated dirty buffer size");

	/* Walk the parts of the buffer touched by [start,end), recording
	 * in each part the dirty range or widening the range of a part
	 * that is already dirty. */
	part = (start - buf_base) / db_pagesize_part;
	part_end = buf_base + part * db_pagesize_part;
	bit = PART2BIT(part);
	new_bits = 0;
	old_bits = b->flush | b->flush_urgent;
	do {
		/* the end of this part or of the change */
		part_end += db_pagesize_part;
		if (part_end > end)
			part_end = end;

		if (!(old_bits & bit)) {
			b->ranges[part].lo = start;
			b->ranges[part].hi = part_end;
		} else {
			if (b->ranges[part].lo > start)
				b->ranges[part].lo = start;
			if (b->ranges[part].hi < part_end)
				b->ranges[part].hi = part_end;
		}
		new_bits |= bit;

		start = part_end;
		bit <<= 1;
		++part;
	} while (part_end < end);

	/* start the timer of the kind of change if it is not running */
	if (urgent) {
		b->flush_urgent |= new_bits;
		if (!db_urgent_need_flush_secs) {
			db_urgent_need_flush_secs = (db_time.tv_sec
						+ DB_URGENT_NEED_FLUSH_SECS);
			if (db_need_flush_secs == 0)
				db_need_flush_secs = db_urgent_need_flush_secs;
		}
	} else {
		b->flush |= new_bits;
		if (db_need_flush_secs == 0
		    || db_need_flush_secs > db_time.tv_sec+DB_NEED_FLUSH_SECS)
			db_need_flush_secs = db_time.tv_sec+DB_NEED_FLUSH_SECS;
	}
}



/* Shut down the database by flushing and releasing all of the
 *	mmap()'ed buffers and closing both files.
 * Nothing is written to the files for mode=-1 because the file is new
 *	and garbage or the caller is a fork of the server shedding memory.
 * Returns 1 if ok and 0 if anything failed. */
u_char
db_close(int mode)			/* -1=invalidate, 0=dirty, 1=clean */
{
	u_char ok;

	if (mode < 0) {
		/* give up on the contents of the buffers */
		db_invalidate = 1;
		rel_db_states();
		ok = (db_unload(0, 0) > 0);
	} else {
		/* flush the data and then release and flush the dirty flags */
		ok = make_clean(mode != 0);
		if (!db_unload(0, 0))
			ok = 0;
	}

	/* Close the hash table first because the server is often
	 * waiting for the lock on the main file held by dbclean. */
	if (db_hash_fd >= 0) {
		if (close(db_hash_fd) < 0) {
			dcc_pemsg(EX_IOERR, 0, "close(%s): %s",
				  db_hash_nm, ERROR_STR());
			ok = 0;
		}
		db_hash_fd = -1;
	}

	if (db_fd >= 0) {
		if (close(db_fd) < 0) {
			dcc_pemsg(EX_IOERR, 0, "close(%s): %s",
				  db_nm, ERROR_STR());
			ok = 0;
		}
		db_fd = -1;
	}

	/* closing the file released any lock */
	db_locked.tv_sec = 0;
	return ok;
}



/* Delete the hash table if the system is being rebooted and we
 * don't trust the file system to get all of the hash table.  This might
 * make system shut down faster.
 * Nothing is done unless the hash table is open and named, the database
 * is locked, and the database is not synchronized with the disk. */
void
db_stop(void)
{
	if (db_hash_fd < 0)
		return;
	if (!DB_IS_LOCKED())
		return;
	if (!db_not_synced)
		return;
	if (db_hash_nm[0] == '\0')
		return;

	/* it is fine if the file is already gone */
	if (unlink(db_hash_nm) < 0
	    && errno != ENOENT)
		dcc_error_msg("unlink(%s): %s", db_hash_nm, ERROR_STR());
}



/* see if (another) instance of dbclean is already running
 *	The lock file is named by adding DB_LOCK_SUFFIX to the name
 *	of the database.  It is opened and locked without waiting and
 *	our process ID is written into it. */
static int dbclean_lock_fd = -1;
static DCC_PATH dbclean_lock_nm;

u_char					/* 1=no (other) dbclean */
lock_dbclean(DCC_EMSG emsg, const char *cur_db_nm)
{
	char pid[32];
	int len;

	fnm2rel_good(dbclean_lock_nm, cur_db_nm, DB_LOCK_SUFFIX);
	dbclean_lock_fd = dcc_lock_open(emsg, dbclean_lock_nm,
					O_RDWR|O_CREAT,
					DCC_LOCK_OPEN_NOWAIT,
					DCC_LOCK_ALL_FILE, 0);
	if (dbclean_lock_fd < 0)
		return 0;

	/* the trailing '\0' is also written */
	len = snprintf(pid, sizeof(pid), "%ld\n", (long)getpid()) + 1;
	if (write(dbclean_lock_fd, pid, len) != len)
		dcc_logbad(EX_IOERR, "write(%s, pid): %s",
			   dbclean_lock_nm, ERROR_STR());

	/* Let anyone write in it in case we are running as root
	 * and get interrupted by a crash or gdb.  A stray, stale
	 * private lock file cannot be locked */
	chmod(dbclean_lock_nm, 0666);

	return 1;
}



/* Delete and close the dbclean lock file if lock_dbclean() opened it. */
void
unlock_dbclean(void)
{
	if (dbclean_lock_fd < 0)
		return;

	if (unlink(dbclean_lock_nm) < 0)
		dcc_error_msg("unlink(%s): %s",
			      dbclean_lock_nm, ERROR_STR());
	close(dbclean_lock_fd);
	dbclean_lock_fd = -1;
}



/* This locking does only multiple-readers/single-writer */
int					/* -1=failed, 0=was not locked, 1=was */
db_lock(void)
{
	struct stat sb;

	if (DB_IS_LOCKED())
		return 1;

	if (!dcc_exlock_fd(0, db_fd, DCC_LOCK_ALL_FILE, 15*60, "", db_nm))
		return -1;
	if (0 > fstat(db_fd, &sb)) {
		db_failure(__LINE__,__FILE__, EX_IOERR, 0,
			   "stat(%s): %s", db_nm, ERROR_STR());
		return -1;
	}
	if (db_fsize != (DB_HOFF)sb.st_size) {
		if (db_fsize > (DB_HOFF)sb.st_size || !db_rdonly) {
			db_failure(__LINE__,__FILE__, EX_IOERR, 0,
				   "%s size changed from "OFF_HPAT
				   " to "OFF_HPAT,
				   db_nm, db_fsize, sb.st_size);
			return -1;
		}
		db_fsize = sb.st_size;
	}

	db_locked = db_time;
	return 0;
}



/* flush buffers to make the disk reasonably correct but not perfect
 *	This does not compensate for a lack of coherent mmap() in the system.
 *
 *	It leaves the disk only as accurate as implied by db_not_synced.
 *	This flushes buffers marked either urgent or ordinarily dirty.
 *	If db_not_synced is set, then non-urgent dirty bits are not set.
 *
 *	Every buffer in use is flushed synchronously even after a failure.
 *	Returns 1 if ok and 0 if any buffer could not be flushed. */
static u_char
make_clean_flush(void)
{
	u_char ok = 1;
	DB_BUF *b = buf_oldest;

	while (b != 0) {
		if (b->buf_type != DB_BUF_TYPE_FREE
		    && !buf_flush(0, b, 0))
			ok = 0;
		b = b->newer;
	}

	return ok;
}



/* push all of our database changes to the disk and try to clear the dirty bit
 *	do not necessarily unmap anything
 *
 *	All of the states are released first.
 *	Returns 1 if ok and 0 if any flush, fsync(), or mapping failed.
 *	Nothing is written and 1 is returned if db_invalidate is set. */
u_char
make_clean(u_char clean)		/* 0=leave hash marked dirty, */
{					/*	1=marked clean, 2=fsync */
	u_char need_db_fsync, result;
	struct stat sb;

	rel_db_states();

	result = 1;

	/* quit if we are giving up */
	if (db_invalidate)
		return result;

	/* do not mark a database clean after a failure has been noted */
	if (db_failed_line)
		clean = 0;

	if (!make_clean_flush()) {
		clean = 0;
		result = 0;
	}

	/* simply unlock all of the buffers if they are clean
	 * and do not need to (or cannot) be synchronized with fsync() */
	if (!db_dirty
	    && (clean < 2		/* not asked to synchronize */
		|| db_rdonly		/* cannot be synchronized */
		|| !db_not_synced))	/* does not need to be synchronized */
		return result;

	need_db_fsync = (clean == 2);

	/* Send the meta-data to disk so that other processes
	 * such as dbclean can find the new length of the file
	 * on Solaris.  Otherwise the file looks broken because
	 * its contained data length can be larger than its
	 * inode size on Solaris. */
	if (!need_db_fsync && clean) {
		if (0 > fstat(db_fd, &sb)) {
			dcc_error_msg("make_clean fstat(%s): %s",
				      db_nm, ERROR_STR());
			need_db_fsync = 1;
		} else if (db_fsize != (DB_HOFF)sb.st_size) {
			if (db_debug)
				quiet_trace_msg("need fsync() because db_fsize="
						OFF_HPAT" but stat="OFF_HPAT,
						db_fsize, sb.st_size);
			need_db_fsync = 1;
		}
	}

	if (need_db_fsync
	    && 0 > fsync(db_fd)) {
		dcc_error_msg("make_clean fsync(%s): %s",
			      db_nm, ERROR_STR());
		clean = 0;
		result = 0;
	}

	/* the hash table control block is needed to change its flags */
	if (clean && !map_hash_ctl(0, 0)) {
		clean = 0;
		result = 0;
	}
	if (clean == 2) {
		if (0 > fsync(db_hash_fd)) {
			dcc_error_msg("make_clean fsync(%s): %s",
				      db_hash_nm, ERROR_STR());
			clean = 0;
			result = 0;
		} else {
			/* both files are on the disk, so note that
			 * in the hash table and flush the note */
			db_not_synced = 0;
			db_sts.hash_ctl.d.vals->s.flags &= ~HASH_CTL_FG_NOSYNC;
			SET_FLUSH_HCTL(1);
			if (!make_clean_flush()) {
				clean = 0;
				result = 0;
			}
		}
	}

	/* Clean the dirty flag in the hash table.
	 * With luck, this will reach the disk after everything else. */
	if (clean
	    && !(db_sts.hash_ctl.d.vals->s.flags & HASH_CTL_FG_CLEAN)) {
		db_sts.hash_ctl.d.vals->s.flags |= HASH_CTL_FG_CLEAN;
		SET_FLUSH_HCTL(0);
	}

	/* finally flush the flag in the hash table */
	rel_db_states();
	if (!make_clean_flush())
		result = 0;

	if (clean)
		db_dirty = 0;
	return result;
}



/* mark the hash file and so the database dirty
 *	The clean flag in the hash table control block is cleared and that
 *	part of the control block is flushed synchronously before db_dirty
 *	is set.  Nothing is done if the database is already dirty.
 *	Returns 1 if the database is now marked dirty and 0 on failure. */
static u_char
db_make_dirty(DCC_EMSG emsg)
{
	if (db_dirty != 0)
		return 1;

	if (!DB_IS_LOCKED()) {
		dcc_logbad(EX_SOFTWARE, "dirtying unlocked database");
		return 0;
	}

	if (db_rdonly != 0)
		dcc_logbad(EX_SOFTWARE, "dirtying read-only database");

	if (map_hash_ctl(emsg, 0) == 0)
		return 0;

	db_sts.hash_ctl.d.vals->s.flags &= ~HASH_CTL_FG_CLEAN;
#ifdef USE_MAP_NOSYNC
	/* note when the files stopped being synchronized with the disk */
	if (!(db_sts.hash_ctl.d.vals->s.flags & HASH_CTL_FG_NOSYNC)) {
		db_sts.hash_ctl.d.vals->s.synced = time(0);
		db_sts.hash_ctl.d.vals->s.flags |= HASH_CTL_FG_NOSYNC;
	}
	db_not_synced = 1;
#endif

	SET_FLUSH_HCTL(1);
	if (buf_flush_part(emsg, db_sts.hash_ctl.b, 0, 0)) {
		db_dirty = 1;
		return 1;
	}
	return 0;
}



/* (start to) unlock the database
 * Marks the database clean, unmaps the buffers that must not stay mapped
 * while unlocked, and releases the lock on the database file. */
u_char					/* 0=failed, 1=at least started */
db_unlock(void)
{
	DB_BUF *buf;
	int ok;

	/* nothing to do if we do not hold the lock */
	if (!DB_IS_LOCKED())
		return 1;

	/* Clear the dirty bit now, while we still hold the lock, because
	 * we may be unable to lock the database later to clear it.
	 * Dbclean needs to see the dirty bit clear. */
	ok = make_clean(1);

	/* Walk the buffers from oldest to newest.
	 * DB_BUF_FG_USE_WRITE buffers are released because they are not
	 *	consistent among processes.
	 * Everything is released if dccd wants to stay out of RAM in favor
	 *	of dbclean */
	buf = buf_oldest;
	while (buf != 0) {
		if (buf->buf_type != DB_BUF_TYPE_FREE
		    && (db_minimum_map
			|| (buf->flags & DB_BUF_FG_USE_WRITE)))
			buf_munmap(0, buf);
		buf = buf->newer;
	}

	if (!dcc_unlock_fd(0, db_fd, DCC_LOCK_ALL_FILE, "", db_nm))
		ok = 0;

	/* a zero timestamp is the mark of an unlocked database */
	db_locked.tv_sec = 0;
	return ok;
}



/* Format a size given in MBytes by converting it to bytes
 * and handing it to db_ptr2str(). */
static const char *
mbyte2str(DB_PTR val)
{
	DB_PTR bytes;

	bytes = val*1024;
	bytes = bytes*1024;
	return db_ptr2str(bytes);
}



#if defined(RLIMIT_AS) || defined(RLIMIT_RSS) || defined(RLIMIT_FSIZE)
/* Reconcile a desired size with one of the process resource limits.
 *	resource    RLIMIT_* value given to getrlimit() and setrlimit()
 *	rlimit_nm   name of the limit for messages
 *	cur_val	    size we would like to use
 *	min_val	    smallest size worth using
 *	val_nm	    name of the size for messages
 * Tries to raise the limit to cur_val plus DB_PAD_BYTE of padding.
 * Returns cur_val if the limit allows it or cannot be examined,
 * or a smaller value that fits under the limit. */
static DB_PTR
use_rlimit(int resource, const char *rlimit_nm,
	   DB_PTR cur_val, DB_PTR min_val, const char *val_nm)
{
	struct rlimit limit_old, limit_new;
	DB_PTR new_val;

	if (0 > getrlimit(resource, &limit_old)) {
		dcc_error_msg("getrlimit(%s): %s", rlimit_nm, ERROR_STR());
		return cur_val;
	}

	/* the soft limit is already big enough
	 * NOTE(review): this test pads with DB_PAD_MBYTE*1024 while
	 * everything below pads with DB_PAD_BYTE -- confirm the units */
	if ((DB_PTR)limit_old.rlim_cur >= cur_val+DB_PAD_MBYTE*1024)
		return cur_val;

	/* assume we are root and try to increase the hard limit.
	 * Test the limit that was just fetched;
	 * limit_new has not been given a value yet. */
	if ((DB_PTR)limit_old.rlim_max < cur_val+DB_PAD_BYTE) {
		limit_new = limit_old;
		limit_new.rlim_max = cur_val+DB_PAD_BYTE;
		if (0 > setrlimit(resource, &limit_new)) {
			if (db_debug)
				quiet_trace_msg("setrlimit(%s, "
						L_DPAT","L_DPAT"): %s",
						rlimit_nm,
						(DB_PTR)limit_new.rlim_cur,
						(DB_PTR)limit_new.rlim_max,
						ERROR_STR());
		} else {
			/* see what the system really gave us */
			if (0 > getrlimit(resource, &limit_old)) {
				dcc_error_msg("getrlimit(%s): %s",
					      rlimit_nm, ERROR_STR());
				return cur_val;
			}
		}
	}

	/* Raise the soft limit as far as the hard limit allows
	 * but no farther than we need. */
	limit_new = limit_old;
	if ((DB_PTR)limit_new.rlim_max < min_val+DB_PAD_BYTE)
		limit_new.rlim_max = min_val + DB_PAD_BYTE;
	limit_new.rlim_cur = limit_new.rlim_max;
	if ((DB_PTR)limit_new.rlim_cur > cur_val+DB_PAD_BYTE)
		limit_new.rlim_cur = cur_val+DB_PAD_BYTE;
	if (0 > setrlimit(resource, &limit_new)) {
		dcc_error_msg("setrlimit(%s, "L_DPAT","L_DPAT"): %s",
			      rlimit_nm,
			      (DB_PTR)limit_new.rlim_cur,
			      (DB_PTR)limit_new.rlim_max,
			      ERROR_STR());
		/* live within the old soft limit, but not below the minimum */
		new_val = limit_old.rlim_cur - DB_PAD_BYTE;
		if (new_val < min_val)
			new_val = min_val;
	} else {
		if (limit_old.rlim_cur < limit_new.rlim_cur
		    && db_debug)
			quiet_trace_msg("increased %s from %s to %s",
					rlimit_nm,
					db_ptr2str(limit_old.rlim_cur),
#ifdef RLIM_INFINITY
					(limit_new.rlim_cur == RLIM_INFINITY)
					? "infinity" :
#endif
					db_ptr2str(limit_new.rlim_cur));
		new_val = limit_new.rlim_cur - DB_PAD_BYTE;
	}

	/* the limit never increases the requested size */
	if (cur_val > new_val) {
		quiet_trace_msg("%s reduced %s from %s to %s",
				rlimit_nm, val_nm,
				db_ptr2str(cur_val),
				db_ptr2str(new_val));
		return new_val;
	}

	return cur_val;
}
#endif



/* Choose the limits on the size of the database (db_max_byte) and on the
 * window of mapped memory (db_max_rss) from the ./configure settings,
 * the size of off_t and pointers, the process resource limits,
 * and the amount of physical memory.
 * Also builds db_physmem_str describing the results for messages. */
static void
get_db_max_rss(void)
{
	DB_PTR old_val, new_val, db_min_mbyte, db_min_byte, db_max_mbyte;
	int physmem_str_len;
	DB_PTR physmem;

	/* use default maximum if maximum is bogus or unset by ./configure */
	db_max_mbyte = MAX_MAX_DB_MBYTE;
#if DB_MAX_MBYTE != 0
	db_max_mbyte = DB_MAX_MBYTE;
	if (db_max_mbyte < DB_MIN_MIN_MBYTE
	    || db_max_mbyte > MAX_MAX_DB_MBYTE) {
		quiet_trace_msg("ignore bad ./configure --with-max-db-mem=%d",
				DB_MAX_MBYTE);
		db_max_mbyte = MAX_MAX_DB_MBYTE;
	} else if (db_debug) {
		quiet_trace_msg("DB max=%s"
				" from ./configure --with-max-db-mem=%d",
				mbyte2str(db_max_mbyte), DB_MAX_MBYTE);
	}
#endif
#ifndef HAVE_BIG_FILES
	/* we need big off_t for files larger than 2 GBytes */
	if (db_max_mbyte > DB_MAX_2G_MBYTE) {
		old_val = db_max_mbyte;
		db_max_mbyte= DB_MAX_2G_MBYTE;
		if (db_debug)
			quiet_trace_msg("32-bit off_t reduced DB max from %s"
					" to %s",
					mbyte2str(old_val),
					mbyte2str(db_max_mbyte));
	}
#endif

	/* use default if ./configure --with-db-memory=MB is bogus or unset */
#if DB_MIN_MBYTE == 0
	db_min_mbyte = 64;
#else
	db_min_mbyte = DB_MIN_MBYTE;
	if (db_min_mbyte < DB_MIN_MIN_MBYTE) {
		quiet_trace_msg("ignore bad ./configure --with-db-memory=%d",
				DB_MIN_MBYTE);
		db_min_mbyte = DB_DEF_MIN_MBYTE;
	} else if (db_min_mbyte > db_max_mbyte) {
		/* both conversions in the format need an argument */
		quiet_trace_msg("ignore ./configure --with-db-memory=%d"
				" > DB max=%s",
				DB_MIN_MBYTE,
				mbyte2str(db_max_mbyte));
		db_min_mbyte = DB_DEF_MIN_MBYTE;
	} else if (db_debug) {
		quiet_trace_msg("use ./configure --with-db-memory=%d",
				DB_MIN_MBYTE);
	}
#endif

	db_min_byte = db_min_mbyte * (1024*1024);
	db_max_byte = db_max_mbyte * (1024*1024);

#ifdef RLIMIT_FSIZE
	/* the database cannot be bigger than the largest file we can write */
	db_max_mbyte = (use_rlimit(RLIMIT_FSIZE, "RLIMIT_FSIZE",
				   db_max_byte, db_min_byte, "DB max")
			/ (1024*1024));
	db_max_byte = db_max_mbyte * (1024*1024);
#endif /* RLIMIT_FSIZE */

	/* Find the size of physical memory with the first available
	 * mechanism that gives an answer. */
	physmem = 0;
#ifdef HAVE_PHYSMEM_TOTAL
	/* maybe someday physmem_total() will be widely available */
	physmem = physmem_total();
	if (db_debug)
		quiet_trace_msg("real=%s from physmem_total()",
				db_ptr2str(physmem));
#endif
#ifdef HAVE__SC_PHYS_PAGES
	if (physmem == 0) {
		long pages, sizepage;

		if ((pages = sysconf(_SC_PHYS_PAGES)) == -1) {
			dcc_error_msg("sysconf(_SC_PHYS_PAGES): %s",
				      ERROR_STR());
		} else if ((sizepage = sysconf(_SC_PAGESIZE)) == -1) {
			dcc_error_msg("sysconf(_SC_PAGESIZE): %s",
				      ERROR_STR());
		} else {
			physmem = (DB_PTR)pages * (DB_PTR)sizepage;
			if (db_debug)
				quiet_trace_msg("real=%s"
						" from sysconf(_SC_PHYS_PAGES)"
						" and sysconf(_SC_PAGESIZE)",
						db_ptr2str(physmem));
		}
	}
#endif
#ifdef HAVE_HW_PHYSMEM
	if (physmem == 0) {
		int mib[2] = {CTL_HW, HW_PHYSMEM};
		unsigned long int hw_physmem;
		size_t hw_physmem_len;

		hw_physmem_len = sizeof(hw_physmem);
		if (0 > sysctl(mib, 2, &hw_physmem, &hw_physmem_len, 0,0)) {
			dcc_error_msg("sysctl(HW_PHYSMEM): %s", ERROR_STR());
		} else {
			physmem = hw_physmem;
			if (db_debug)
				quiet_trace_msg("real=%s from sysctl(mib)",
						db_ptr2str(physmem));
		}
	}
#endif
#ifdef HAVE_PSTAT_GETSTATIC
	if (physmem == 0) {
		struct pst_static pss;

		if (0 > pstat_getstatic(&pss, sizeof pss, 1, 0)) {
			dcc_error_msg("pstat_getstatic(): %s", ERROR_STR());
		} else if (pss.physical_memory <= 0
			   || pss.page_size < 0) {
			dcc_error_msg("pstat_getstatic() says"
				      " physical_memory=%d page_size=%d",
				      pss.physical_memory, pss.page_size);
		} else {
			physmem = ((DB_PTR)pss.physical_memory
				   * (DB_PTR)pss.page_size);
			if (db_debug)
				quiet_trace_msg("real=%s"
						" from pstat_getstatic()",
						db_ptr2str(physmem));
		}
	}
#endif

	physmem_str_len = 0;
	db_physmem_str[0] = '\0';
	if (physmem == 0) {
		quiet_trace_msg("failed to get real memory size");
	} else {
		physmem_str_len = snprintf(db_physmem_str,
					   sizeof(db_physmem_str),
					   "  real=%s",
					   db_ptr2str(physmem));
		/* snprintf() reports an error with a negative value and
		 * truncation with the length it wanted, so keep the offset
		 * used for the final snprintf() inside the buffer */
		if (physmem_str_len < 0)
			physmem_str_len = 0;
		else if (physmem_str_len >= ISZ(db_physmem_str))
			physmem_str_len = ISZ(db_physmem_str)-1;

		/* Try to use half of physical memory
		 *	if there is less than 2 GByte
		 * all except 512 MByte between 2 GByte and 4 GByte,
		 * and all but 1 GByte if there is more than 4 GByte */
		if (physmem/(1024*1024) < 2*1024)
			new_val = physmem/2;
		else if (physmem/(1024*1024) <= 4*1024)
			new_val = physmem - 512*(1024*1024);
		else
			new_val = physmem - 1024*(1024*1024);
		if (new_val < db_min_byte) {
			if (db_debug)
				quiet_trace_msg("real=%s would give DB max=%s"
						" smaller than minimum %s",
						db_ptr2str(physmem),
						db_ptr2str(new_val),
						mbyte2str(db_min_mbyte));
			new_val = db_min_byte;
		}
		if (db_max_byte > new_val) {
			old_val = db_max_byte;
			db_max_mbyte = new_val / (1024*1024);
			db_max_byte = db_max_mbyte * (1024*1024);
			if (db_debug)
				quiet_trace_msg("real=%s reduced DB max"
						" from %s to %s",
						db_ptr2str(physmem),
						db_ptr2str(old_val),
						db_ptr2str(db_max_byte));
		}
	}

	/* window need not be larger than the limit on the database size */
	db_max_rss = db_max_byte;

#ifdef RLIMIT_AS
	/* try not to break process virtual memory limit,
	 * but only if it is not ridiculously tiny */
	db_max_rss = use_rlimit(RLIMIT_AS, "RLIMIT_AS",
				db_max_rss, db_min_byte, "max RSS");
#endif /* RLIMIT_AS */
#ifdef RLIMIT_RSS
	/* try not to break process resident memory limit
	 * but only if it is not ridiculously tiny */
	db_max_rss = use_rlimit(RLIMIT_RSS, "RLIMIT_RSS",
				db_max_rss, db_min_byte, "max RSS");
#endif /* RLIMIT_RSS */

	/* limit the database to the window size */
	if (db_max_byte > db_max_rss) {
		old_val = db_max_mbyte;
		db_max_mbyte = db_max_rss / (1024*1024);
		db_max_byte = db_max_mbyte * (1024*1024);
		if (db_debug)
			quiet_trace_msg("max RSS reduced DB max from %s to %s",
					mbyte2str(old_val),
					mbyte2str(db_max_mbyte));
	}

#ifndef HAVE_64BIT_PTR
	/* We cannot use a window larger than 2 GBytes on most systems without
	 * big pointers.  Among the things that break is trying to mmap() more
	 * than 2 GBytes.  So limit the window on 32-bit systems to a little
	 * less than 2 GBytes and the database to not much more */
	if (db_max_rss > DB_MAX_2G_MBYTE*(1024*1024)) {
		if (db_debug)
			quiet_trace_msg("32-bit pointers reduced max RSS"
					" from %s to %s",
					db_ptr2str(db_max_rss),
					mbyte2str(DB_MAX_2G_MBYTE));
		db_max_rss = DB_MAX_2G_MBYTE*(1024*1024);
		new_val = db_max_rss+db_max_rss/4;
		if (db_max_byte > new_val) {
			old_val = db_max_mbyte;
			db_max_mbyte = new_val / (1024*1024);
			db_max_byte = db_max_mbyte * (1024*1024);
			if (db_debug)
				quiet_trace_msg("32-bit pointers reduced DB max"
						" from %s to %s",
						mbyte2str(old_val),
						mbyte2str(db_max_mbyte));
		}
	}
#endif

	/* append the chosen limits to the description of real memory */
	snprintf(&db_physmem_str[physmem_str_len],
		 sizeof(db_physmem_str) - physmem_str_len,
		 "  max RSS=%s  DB max=%s",
		 db_ptr2str(db_max_rss), mbyte2str(db_max_mbyte));
}



/* Choose the size of the DB buffers or pages.
 * The size must hold an integral number of hash table entries and be a
 * multiple of the system's page size and of the DB record header length.
 * As a side effect, db_buf_total is set to the number of buffers to use.
 * Returns the page size, or 0 if old_pagesize cannot be used. */
u_int
db_get_pagesize(u_int old_pagesize,	/* 0 or required page size */
		u_int tgt_pagesize)	/* 0 or target page size */
{
	u_int smallest, largest;

	/* Ask the operating system about memory only once so that
	 * differing answers cannot give a varying page size.
	 * Some systems can't keep their stories straight. */
	if (db_max_rss == 0)
		get_db_max_rss();

	/* The smallest usable page size is the least common multiple
	 * of the system page and the DB hash table entry size. */
	system_pagesize = getpagesize();
	smallest = lcm(system_pagesize, sizeof(HASH_ENTRY));

	/* The kludge to speed conversion of database addresses to page numbers
	 * and offsets on 32-bit systems depends on the page size being
	 * a multiple of 256 */
	if ((smallest % (1<<DB_PTR_SHIFT)) != 0)
		dcc_logbad(EX_SOFTWARE, "page size not a multiple of 256");

	/* The page size must also be a multiple of the end-of-page
	 * padding used in the main database file. */
	if (sizeof(DB_RCD) % DB_RCD_HDR_LEN != 0)
		dcc_logbad(EX_SOFTWARE,
			   "DB padding size %d"
			   " is not a divisor of DB entry size %d",
			   DB_RCD_HDR_LEN, ISZ(DB_RCD));
	if (DB_RCD_LEN_MAX % DB_RCD_HDR_LEN != 0)
		dcc_logbad(EX_SOFTWARE,
			   "DB record not a multiple of header size");
	smallest = lcm(smallest, DB_RCD_HDR_LEN);

	if (old_pagesize != 0) {
		/* An existing file dictates the page size so that we are
		 * not confused by padding at the ends of the old pages.
		 * Fail if that size is impossible, which should cause
		 * dbclean to rebuild the database. */
		if ((old_pagesize % smallest) != 0)
			return 0;

		/* adjust the number of buffers to fit our window size */
		db_buf_total = db_max_rss / old_pagesize;
		if (db_buf_total < (int)DB_BUF_MIN)
			return 0;
		if (db_buf_total > DB_BUF_MAX)
			db_buf_total = DB_BUF_MAX;
		return old_pagesize;
	}

	/* the largest page that lets all of the buffers fit in the window */
	db_buf_total = DB_BUF_MAX;
	largest = db_max_rss / db_buf_total;
	largest -= largest % smallest;

	/* Without a usable target, take pages as large as possible.
	 * Normal DCC databases grow large and want that. */
	if (tgt_pagesize == 0 || tgt_pagesize >= largest)
		return (largest > smallest) ? largest : smallest;

	/* Greylist databases are often small, and we want a tiny page
	 * when first reading the parameters while opening,
	 * so round the target down to something legal. */
	tgt_pagesize -= tgt_pagesize % smallest;
	return (tgt_pagesize < smallest) ? smallest : tgt_pagesize;
}



/* (re)create the buffer pool
 * The buffers are small blocks that point to the real mmap()'ed memory.
 * Picks the page size, derives the sizes that depend on it, and puts the
 * first db_buf_total entries of db_bufs on the oldest-to-newest list.
 * Returns 0 if no page size compatible with old_pagesize exists.
 */
u_char
db_buf_init(u_int old_pagesize,		/* 0 or required page size */
	    u_int tgt_pagesize)		/* 0 or target page size */
{
	DB_BUF *cur, *prev;
	int remaining;

	db_pagesize = db_get_pagesize(old_pagesize, tgt_pagesize);
	if (db_pagesize == 0)
		return 0;

	/* The fragments of pages must be multiples of system pages
	 * so that msync() on Solaris can be given multiples of system
	 * pages.  It's also a generally good idea.
	 * So round the fragment size up to whole system pages. */
	db_pagesize_part = db_pagesize/DB_BUF_NUM_PARTS;
	db_pagesize_part = ((db_pagesize_part + system_pagesize-1)
			    / system_pagesize) * system_pagesize;

	db_page_max = db_pagesize - DB_RCD_HDR_LEN;
	db_hash_page_len = db_pagesize/sizeof(HASH_ENTRY);

	/* the largest hash table is a whole number of pages */
	db_max_hash_entries = (MAX_HASH_ENTRIES
			       - MAX_HASH_ENTRIES % db_hash_page_len);

	/* Chain the buffers together in array order,
	 * with the first as the oldest and the last as the newest. */
	memset(db_bufs, 0, sizeof(db_bufs));
	cur = db_bufs;
	buf_oldest = cur;
	prev = 0;
	remaining = db_buf_total;
	while (--remaining != 0) {
		cur->older = prev;
		cur->newer = cur+1;
		prev = cur;
		++cur;
	}
	/* the newest buffer keeps the null ->newer left by memset() */
	cur->older = prev;
	buf_newest = cur;

	/* no buffer is on a hash chain yet */
	memset(db_buf_hash, 0, sizeof(db_buf_hash));

	return 1;
}



/* Create an empty hash table file with room for at least new_hash_len
 * entries and link all of its entries into the free list.
 * The already open db_hash_fd is truncated and rebuilt.
 * Returns 1 on success, or 0 after putting a message in emsg. */
static u_char
make_new_hash(DCC_EMSG emsg, DB_HADDR new_hash_len)
{
	struct stat sb;
	HASH_ENTRY *hash;
	DB_HADDR next_haddr, cur_haddr, prev_haddr;
	u_int pagenum;

	if (getuid() == 0) {
		/* if we are running as root,
		 * don't change the owner of the database */
		if (0 > fstat(db_fd, &sb)) {
			dcc_pemsg(EX_IOERR, emsg, "fstat(%s): %s",
				  db_nm, ERROR_STR());
			return 0;
		}
		/* give the hash file the owner and group of the main file */
		if (0 > fchown(db_hash_fd, sb.st_uid, sb.st_gid)) {
			dcc_pemsg(EX_IOERR, emsg, "fchown(%s,%d,%d): %s",
				  db_hash_nm, (int)sb.st_uid, (int)sb.st_gid,
				  ERROR_STR());
			return 0;
		}
	}

	if (new_hash_len < MIN_HASH_ENTRIES)
		new_hash_len = MIN_HASH_ENTRIES;

	/* Increase the requested hash table size to a multiple of the database
	 * page size.  The page size is chosen to be a multiple of the size of
	 * a single hash table entry. */
	db_hash_fsize = (((DB_HOFF)new_hash_len)*sizeof(HASH_ENTRY)
			 + db_pagesize-1);
	db_hash_fsize -= db_hash_fsize % db_pagesize;
	new_hash_len = db_hash_fsize / sizeof(HASH_ENTRY);

	/* NOTE(review): db_hash_fsize is not recomputed if the length
	 * is reduced here */
	if (new_hash_len > db_max_hash_entries)
		new_hash_len = db_max_hash_entries;

	/* create the empty hash table file */
	rel_db_states();
	if (!db_unload(emsg, 0))
		return 0;
	/* NOTE(review): the file is truncated to 0 bytes but the message
	 * reports db_csize */
	if (0 > ftruncate(db_hash_fd, 0)) {
		dcc_pemsg(EX_IOERR, emsg, "truncate(%s,"L_HPAT"): %s",
			  db_hash_nm, db_csize, ERROR_STR());
		return 0;
	}

	db_hash_len = new_hash_len;
	db_hash_used_stored_hash = db_hash_used = DB_HADDR_BASE;
	db_hash_divisor = get_db_hash_divisor(db_hash_len);

	/* Clear new hash file by linking its entries into the free list */
	/* map and clear the first page */
	if (!map_hash_ctl(emsg, 1))
		return 0;

	/* create the header */
	strcpy(db_sts.hash_ctl.d.vals->s.magic, HASH_MAGIC_STR);
	db_sts.hash_ctl.d.vals->s.free_fwd = DB_HADDR_BASE;
	db_sts.hash_ctl.d.vals->s.free_bak = db_hash_len-1;
	db_sts.hash_ctl.d.vals->s.len = db_hash_len;
	db_sts.hash_ctl.d.vals->s.divisor = db_hash_divisor;
	db_sts.hash_ctl.d.vals->s.used = DB_HADDR_BASE;
	db_sts.hash_ctl.d.vals->s.synced = time(0);
	/* the new header lacks HASH_CTL_FG_CLEAN, so the database is dirty */
	db_dirty = 1;
#ifdef USE_MAP_NOSYNC
	db_sts.hash_ctl.d.vals->s.synced = time(0);
	db_sts.hash_ctl.d.vals->s.flags |= HASH_CTL_FG_NOSYNC;
	db_not_synced = 1;
#endif

	/* Link the hash table entries in the first and following pages.
	 * The page size is chosen to be a multiple of the size of a
	 * single hash table entry. */
	prev_haddr = FREE_HADDR_END;
	cur_haddr = DB_HADDR_BASE;
	next_haddr = cur_haddr+1;
	hash = &db_sts.hash_ctl.d.vals->h[DB_HADDR_BASE];
	pagenum = 0;
	for (;;) {
		/* link entries until the address reaches a page boundary */
		do {
			DB_HADDR_CP(hash->bak, prev_haddr);
			/* the last entry ends the free list */
			if (next_haddr == db_hash_len)
				DB_HADDR_CP(hash->fwd, FREE_HADDR_END);
			else
				DB_HADDR_CP(hash->fwd, next_haddr);
			++hash;
			prev_haddr = cur_haddr;
			cur_haddr = next_haddr++;
		} while (cur_haddr % db_hash_page_len != 0);

		/* stop after the last page of the file */
		if (++pagenum >= db_hash_fsize/db_pagesize)
			break;

		/* map the next page and mark all of it for urgent flushing */
		if (!map_hash(emsg, cur_haddr, &db_sts.free, 1))
			return 0;
		db_sts.free.b->flush_urgent = (DB_BUF_FM)-1;
		hash = db_sts.free.d.h;
	}

	hash_clear_pg_num = 0;

	return 1;
}



/* Check that an existing hash table file is usable and consistent with
 * the main database file, and load its parameters into db_hash_fsize,
 * db_hash_len, db_hash_divisor, db_hash_used, and related variables.
 * Returns 1 if the hash table can be used,
 * or 0 after putting a description of the problem in emsg. */
static u_char
check_old_hash(DCC_EMSG emsg)
{
	static const u_char magic[sizeof(((HASH_CTL*)0)->s.magic)
				  ] = HASH_MAGIC_STR;
	const HASH_CTL *vals;
	struct stat sb;
	u_char old_db;

	/* check the size of the existing hash file */
	if (0 > fstat(db_hash_fd, &sb)) {
		dcc_pemsg(EX_IOERR, emsg, "stat(%s): %s",
			  db_hash_nm, ERROR_STR());
		return 0;
	}
	db_hash_fsize = sb.st_size;
	if ((db_hash_fsize % sizeof(HASH_ENTRY)) != 0) {
		dcc_pemsg(EX_DATAERR, emsg, "%s has size "OFF_DPAT","
			  " not a multiple of %d",
			  db_hash_nm, db_hash_fsize,
			  ISZ(HASH_ENTRY));
		return 0;
	}

	db_hash_len = db_hash_fsize/sizeof(HASH_ENTRY);
	if (db_hash_len < MIN_HASH_ENTRIES) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s has too few records, "OFF_DPAT" bytes",
			  db_hash_nm, db_hash_fsize);
		return 0;
	}

	/* check the magic number */
	if (!map_hash_ctl(emsg, 0))
		return 0;
	vals = db_sts.hash_ctl.d.vals;
	if (memcmp(vals->s.magic, &magic, sizeof(magic))) {
		/* print no more than the magic field itself,
		 * which might not be terminated */
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s has the wrong magic \"%.*s\"",
			  db_hash_nm, ISZ(vals->s.magic), vals->s.magic);
		return 0;
	}

	if (!(vals->s.flags & HASH_CTL_FG_CLEAN)) {
		dcc_pemsg(EX_DATAERR, emsg, "%s was not closed cleanly",
			  db_hash_nm);
		return 0;
	}
	if (vals->s.flags & HASH_CTL_FG_NOSYNC) {
#ifdef HAVE_BOOTTIME
		int mib[2] = {CTL_KERN, KERN_BOOTTIME};
		size_t boottime_len;
#endif
		struct timeval boottime;

		/* A file that was not synchronized is acceptable only if it
		 * was marked after the system was booted.  Without a known
		 * boot time, no such file is acceptable. */
		boottime.tv_sec = 0x7fffffff;
#ifdef HAVE_BOOTTIME
		boottime_len = sizeof(boottime);
		if (0 > sysctl(mib, 2, &boottime, &boottime_len, 0, 0)) {
			dcc_error_msg("sysctl(KERN_BOOTTIME): %s", ERROR_STR());
		}
#endif
		if (vals->s.synced <= boottime.tv_sec) {
			dcc_pemsg(EX_DATAERR, emsg, "%s was not synchronized;"
				  " synced=%d boottime=%d",
				  db_hash_nm,
				  (int)vals->s.synced, (int)boottime.tv_sec);
			return 0;
		}
		db_not_synced = 1;
	}

	/* An invalid address at either end of the free list is acceptable
	 * only if both ends are FREE_HADDR_END. */
	if (DB_HADDR_INVALID(vals->s.free_fwd)
	    && (vals->s.free_fwd != FREE_HADDR_END
		|| vals->s.free_fwd != vals->s.free_bak)) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s has a broken free list head of %#x",
			  db_hash_nm, vals->s.free_fwd);
		return 0;
	}
	if (DB_HADDR_INVALID(vals->s.free_bak)
	    && (vals->s.free_bak != FREE_HADDR_END
		|| vals->s.free_fwd != vals->s.free_bak)) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s has a broken free list tail of %#x",
			  db_hash_nm, vals->s.free_bak);
		return 0;
	}

	/* the header must agree with the size of the file */
	if (db_hash_len != vals->s.len) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s has %d entries but claims %d",
			  db_hash_nm, db_hash_len,
			  vals->s.len);
		return 0;
	}

	db_hash_divisor = vals->s.divisor;
	if (db_hash_divisor < MIN_HASH_DIVISOR
	    || db_hash_divisor >= db_hash_len) {
		/* report the divisor that was rejected */
		dcc_pemsg(EX_DATAERR, emsg, "%s has hash divisor %d",
			  db_hash_nm, db_hash_divisor);
		return 0;
	}

	db_hash_used_stored_hash = db_hash_used = vals->s.used;
	if (db_hash_used < DB_HADDR_BASE) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s contains impossible %u entries",
			  db_hash_nm, HADDR2LEN(db_hash_used));
		return 0;
	}
	if (db_hash_used >= db_hash_len) {
		if (db_hash_used > db_hash_len)
			dcc_pemsg(EX_DATAERR, emsg,
				  "%s contains only %u entries but %u used",
				  db_hash_nm,
				  HADDR2LEN(db_hash_len),
				  HADDR2LEN(db_hash_used));
		else
			dcc_pemsg(EX_DATAERR, emsg,
				  "%s is filled with %u entries",
				  db_hash_nm,
				  HADDR2LEN(db_hash_len));
		return 0;
	}

	/* old databases lack the growth values */
	old_db = 0;
	if (!db_rdonly
	    && db_parms.old_db_csize == 0
	    && db_parms.db_added == 0
	    && db_parms.hash_used == 0
	    && db_parms.old_hash_used == 0
	    && db_parms.hash_added == 0
	    && db_parms.rate_secs == 0
	    && db_parms.last_rate_sec == 0) {
		quiet_trace_msg("repair database growth measurements");
		db_parms.old_db_csize = db_parms.db_csize;
		old_db = 1;
	}

	/* The main file and the hash table must agree on the number of
	 * used entries unless the values are being repaired. */
	if (db_hash_used != db_parms.hash_used
	    && db_hash_fsize != 0) {
		if (old_db) {
			quiet_trace_msg("repair db_parms.old hash_used"
					" and old_hash_used");
			db_parms.old_hash_used = db_hash_used;
			db_parms.hash_used = db_hash_used;
		} else {
			dcc_pemsg(EX_DATAERR, emsg,
				  "%s contains %d"
				  " entries instead of the %d that %s claims",
				  db_hash_nm, db_hash_used,
				  db_parms.hash_used, db_nm);
			return 0;
		}
	}

	/* the hash table also records the size of the main file */
	db_csize_stored_hash = vals->s.db_csize;
	if (db_csize_stored_hash != db_csize
	    && db_hash_fsize != 0) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s contains "L_DPAT
			  " bytes instead of the "L_DPAT" that %s claims",
			  db_nm, db_csize,
			  db_csize_stored_hash, db_hash_nm);
		return 0;
	}

	return 1;
}



/* open the files and generally get ready to work
 * Closes anything already open, opens (and locks) or adopts the main
 * database file, checks its header, sizes the buffer pool to the page size
 * recorded in the file, and then either creates a new hash table
 * (new_hash_len != 0) or checks the existing one. */
u_char					/* 0=failed, 1=ok */
db_open(DCC_EMSG emsg,
	int new_db_fd,			/* -1 or already open db_fd */
	const char *new_db_nm,
	DB_HADDR new_hash_len,		/* 0 or # of entries */
	DB_OPEN_MODES mode)			/* DB_OPEN_* */
{
	u_int cur_pagesize;
	int hash_flags, db_open_flags;
	struct stat db_sb;
	/* Give up: forget a file descriptor supplied by the caller before
	 * db_close(), then fail. */
#	define OPEN_BAIL() {if (new_db_fd >= 0) db_fd = -1;		\
		db_close(-1); return 0;}

	/* start from a known state */
	db_close(1);
	db_failed_line = __LINE__;
	db_failed_file = __FILE__;
	db_not_synced = 0;
	db_minimum_map = 0;
	db_invalidate = 0;
	db_dirty = 0;
	db_locked.tv_sec = 0;

	db_rdonly = (mode & DB_OPEN_RDONLY) != 0;
	db_use_write = (mode & DB_OPEN_MMAP_WRITE) != 0;

	memset(&db_stats, 0, sizeof(db_stats));

	/* use the default name only if none has ever been set */
	if (!new_db_nm && db_nm[0] == '\0')
		new_db_nm = grey_on ? DB_GREY_NAME : DB_DCC_NAME;
	if (new_db_nm) {
		if (!fnm2rel(db_nm, new_db_nm, 0)
		    || !fnm2rel(db_hash_nm, db_nm, DB_HASH_SUFFIX)) {
			dcc_pemsg(EX_DATAERR, emsg,
				  "invalid DB nm \"%s\"", new_db_nm);
			return 0;
		}
	}

	if (new_db_fd >= 0) {
		/* A file opened by the caller is not locked here,
		 * so it can only be read. */
		if (new_hash_len != 0) {
			dcc_logbad(EX_SOFTWARE,
				   "extending db_open(%s) without locking",
				   db_nm);
			return 0;
		}
		if (!db_rdonly) {
			dcc_logbad(EX_SOFTWARE,
				   "db_open(%s) read/write without locking",
				   db_nm);
			return 0;
		}
		db_open_flags = O_RDONLY;
		hash_flags = O_RDONLY;

		db_fd = new_db_fd;

	} else {
		db_open_flags = O_RDWR;
		if (new_hash_len != 0) {
			if (db_rdonly) {
				dcc_logbad(EX_SOFTWARE,
					   "db_open(%s) creating read-only",
					   db_nm);
				return 0;
			}
			hash_flags = O_RDWR | O_CREAT;
		} else {
			/* must open the file read/write to lock it */
			hash_flags = O_RDWR;
		}

		db_fd = dcc_lock_open(emsg, db_nm, db_open_flags,
				      (mode & DB_OPEN_LOCK_NOWAIT)
				      ? DCC_LOCK_OPEN_NOWAIT
				      : 0,
				      DCC_LOCK_ALL_FILE, 0);
		if (db_fd == -1) {
			db_close(-1);
			return 0;
		}
	}
	/* a non-zero db_locked marks the database as locked */
	gettimeofday(&db_time, 0);
	db_locked = db_time;
	if (0 > fstat(db_fd, &db_sb)) {
		dcc_pemsg(EX_IOERR, emsg, "stat(%s): %s", db_nm, ERROR_STR());
		OPEN_BAIL();
		/* not reached because OPEN_BAIL() returns */
		return 0;
	}
	db_csize = db_fsize = db_sb.st_size;
	if (db_fsize < ISZ(DB_HDR)) {
		dcc_pemsg(EX_IOERR, emsg,
			  "%s with %d bytes is too small to be a DCC database",
			  db_nm, (int)db_fsize);
		OPEN_BAIL();
	}

	/* check the header of the database file by temporarily mapping it */
	db_buf_init(0, sizeof(DB_HDR));
	if (!map_db(emsg, 0, sizeof(DB_HDR), &db_sts.db_parms, 0))
		OPEN_BAIL();

	/* keep a copy of the stored parameters and a working copy */
	db_parms_stored = *db_sts.db_parms.d.parms;
	db_parms = *db_sts.db_parms.d.parms;

	if (memcmp(db_parms.version, db_version_buf, sizeof(db_version_buf))) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s contains the wrong magic string \"%.*s\"",
			  db_nm, ISZ(db_parms.version), db_parms.version);
		OPEN_BAIL();
	}
	/* the file must be a greylist database if and only if grey_on */
	if (!(db_parms.flags & DB_PARM_FG_GREY) != !grey_on) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s is%s a greylist database but must%s be",
			  db_nm,
			  (db_parms.flags & DB_PARM_FG_GREY) ? "" : " not",
			  grey_on ? "" : " not");
		OPEN_BAIL();
	}

	cur_pagesize = db_parms.pagesize;

	DB_SET_NOKEEP(db_parms.nokeep_cks, DCC_CK_INVALID);
	DB_SET_NOKEEP(db_parms.nokeep_cks, DCC_CK_FLOD_PATH);
	set_db_tholds(db_parms.nokeep_cks);

	db_ck_fuzziness = grey_on ? grey_ck_fuzziness : dcc_ck_fuzziness;

	/* compare the size the header claims with the real size */
	db_csize = db_parms.db_csize;
	if (db_csize < sizeof(DB_HDR)) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s says it contains "L_DPAT" bytes"
			  " or fewer than the minimum of %d",
			  db_nm, db_csize, DB_PTR_BASE);
		/* that is a fatal error if we are not rebuilding */
		/* NOTE(review): this bails out when a new hash table
		 * was requested; confirm it matches the comment above */
		if (new_hash_len != 0)
			OPEN_BAIL();
	}
	if (db_csize > db_fsize) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s says it contains "L_DPAT" bytes"
			  " or more than the actual size of "OFF_DPAT,
			  db_nm, db_csize, db_fsize);
		/* that is a fatal error if we are not rebuilding */
		/* NOTE(review): same test as above; the later size check
		 * fails for this case regardless of new_hash_len */
		if (new_hash_len != 0)
			OPEN_BAIL();
	}

	/* The buffer or page size we use must be the page size used to
	 * write the files.  Try to change our size to match the file */
	if (cur_pagesize != db_pagesize) {
		/* discard the temporary mapping of the header */
		db_invalidate = 1;
		rel_db_states();
		if (!db_unload(emsg, 0))
			OPEN_BAIL();
		db_invalidate = 0;
		if (!db_buf_init(cur_pagesize, 0)) {
			dcc_error_msg("%s has page size %d"
				      " incompatible with %d in %s",
				      db_nm,
				      cur_pagesize, db_get_pagesize(0, 0),
				      path2fnm(db_hash_nm));
			OPEN_BAIL();
		}
	}

	db_csize_stored_hash = 0;
	db_hash_len = 0;
	db_hash_fd = open(db_hash_nm, hash_flags, 0666);
	if (db_hash_fd < 0) {
		dcc_pemsg(EX_IOERR, emsg, "open(%s): %s",
			  db_hash_nm, ERROR_STR());
		OPEN_BAIL();
	}
	if (0 > fcntl(db_hash_fd, F_SETFD, FD_CLOEXEC)) {
		dcc_pemsg(EX_IOERR, emsg, "fcntl(%s, FD_CLOEXEC): %s",
			  db_hash_nm, ERROR_STR());
		OPEN_BAIL();
	}

	/* build a new hash table if a size was given
	 * and otherwise check the existing table */
	if (new_hash_len != 0) {
		if (!make_new_hash(emsg, new_hash_len))
			OPEN_BAIL();
	} else {
		if (!check_old_hash(emsg))
			OPEN_BAIL();
	}

	if (db_fsize % db_pagesize != 0) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s has size "OFF_HPAT","
			  " not a multiple of its page size of %#x",
			  db_nm, db_fsize, db_pagesize);
		OPEN_BAIL();
	}
	/* the file may exceed the claimed size by no more than one page */
	if (db_fsize > db_csize + db_pagesize || db_csize > db_fsize) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s has size "OFF_HPAT" but claims "L_HPAT,
			  db_nm, db_fsize, db_csize);
		OPEN_BAIL();
	}

#ifndef USE_MAP_NOSYNC
	/* Use `dbclean -F` on systems without mmap(NOSYNC) but with lots of
	 * RAM.  Some Linux systems otherwise take too long to run dbclean. */
	if (mode & DB_OPEN_MMAP_WRITE_NOSYNC) {
		if (db_max_rss > db_fsize + db_hash_fsize)
			db_use_write = 1;
		if (db_debug)
			quiet_trace_msg("db_max_rss="OFF_HPAT
					" db_fsize+db_hash_fsize="OFF_HPAT
					" so%s use -F",
					db_max_rss, db_fsize+db_hash_fsize,
					db_use_write ? "" : " do not");
	}
#endif

	/* describe the window of buffers for later messages */
	db_window_size = (DB_PTR)db_pagesize * db_buf_total;
	snprintf(db_window_size_str, sizeof(db_window_size_str),
		 "window=%s%s",
		 db_ptr2str(db_window_size), db_physmem_str);
	rel_db_states();
	/* success: clear the failure location set at the start */
	db_failed_line = 0;

	return 1;
#undef OPEN_BAIL
}



/* Flush a buffer, unmap its memory, and mark the buffer free.
 * The buffer is marked free even if flushing or unmapping fails.
 * Returns 0 if the flush or the munmap() failed, and otherwise 1. */
static u_char
buf_munmap(DCC_EMSG emsg, DB_BUF *b)
{
	u_char ok;

	if (b->lock_cnt != 0)
		dcc_logbad(EX_SOFTWARE, "unmapping locked DB buffer");

	/* write whatever is pending before the memory goes away */
	ok = buf_flush(emsg, b, 1);

	/* when invalidating, tell the system that
	 * the contents are not wanted */
	if (db_invalidate
	    && 0 > DCC_MADV_FREE(b->buf.v))
		dcc_error_msg("madvise(FREE %s,%#x): %s",
			      buf2path(b), db_pagesize, ERROR_STR());

	if (0 > munmap(b->buf.v, db_pagesize)) {
		db_failure(__LINE__,__FILE__, EX_IOERR, emsg,
			   "munmap(%s,%d): %s",
			   buf2path(b), db_pagesize, ERROR_STR());
		ok = 0;
	}

	/* the buffer no longer describes any page */
	b->buf_type = DB_BUF_TYPE_FREE;
	b->pg_num = -1;
	b->buf.v = 0;

	return ok;
}



/* Map memory for a buffer.
 *	emsg	receives the message for a failure
 *	b	buffer to receive the mapping; buf2fd(b) chooses the file
 *	pg_num	page to map; the file offset is pg_num*db_pagesize
 *	extend	nonzero to map private memory at offset 0 (anonymous memory
 *		where MAP_ANON or MAP_ANONYMOUS exists) instead of page
 *		pg_num of the file
 * Failed mmap() calls are retried after trying to release other buffers.
 * Returns 1 with b->buf.v pointing to the mapping, or 0 on failure. */
static u_char
buf_mmap(DCC_EMSG emsg, DB_BUF *b, DB_PG_NUM pg_num, u_char extend)
{
	int prot, flags;
	off_t offset;
	int fd;
	void *p;
	int retry;
	u_char unloaded;
	u_char mapped;


	offset = (off_t)pg_num * (off_t)db_pagesize;
	fd = buf2fd(b);

	if (extend) {
		offset = 0;
#if defined(MAP_ANON)|| defined(MAP_ANONYMOUS)
		fd = -1;
		b->flags |= DB_BUF_FG_USE_WRITE | DB_BUF_FG_EXTENSION;
#ifdef MAP_ANONYMOUS
		/* Linux redefines things and requires either MAP_ANON
		 * or MAP_PRIVATE; */
		flags = MAP_ANONYMOUS| MAP_PRIVATE;
#else
		flags = MAP_ANON | MAP_PRIVATE;
#endif /* MAP_ANONYMOUS */
#else /* have neither MAP_ANON nor MAP_ANONYMOUS */
		b->flags |= DB_BUF_FG_USE_WRITE;
		flags = MAP_PRIVATE;
#endif
	} else if (db_rdonly) {
		flags = MAP_SHARED;
	} else if (db_use_write && !db_minimum_map) {
		/* write() buffers instead of letting the Solaris virtual
		 * memory system do it. Solaris will bog the system down doing
		 * nothing but flushing dirty mmap() pages
		 * We cannot use this hack in two processes simultaneously,
		 * so do not use it in dccd while dbclean is running */
		b->flags |= DB_BUF_FG_USE_WRITE;
		flags = MAP_PRIVATE;
	} else {
#ifdef USE_MAP_NOSYNC
		flags = (MAP_SHARED | MAP_NOSYNC);
#else
		flags = MAP_SHARED;
#endif
	}

	prot = db_rdonly ? PROT_READ : (PROT_READ | PROT_WRITE);
	mapped = 0;
	for (retry = 1, unloaded = 2; unloaded > 1; ++retry) {
		p = mmap(0, db_pagesize, prot, flags, fd, offset);

		if (p == MAP_FAILED) {
			/* give up at once on errors that retrying cannot
			 * fix and after too many attempts */
			if (errno == EACCES
			    || errno == EBADF
			    || errno == EINVAL
			    || errno == ENODEV
			    || retry > 20) {
				dcc_pemsg(EX_IOERR, emsg,
					  "try #%d"" mmap(%s"
					  " %#x,%#x,%#x,%d,"OFF_HPAT"): %s",
					  retry,
					  buf2path(b),
					  db_pagesize, prot, flags, fd, offset,
					  ERROR_STR());
				return 0;
			}
			dcc_error_msg("try #%d mmap(%s"
				      " %#x,%#x,%#x,%d,"OFF_HPAT"): %s",
				      retry,
				      buf2path(b),
				      db_pagesize, prot, flags, fd, offset,
				      ERROR_STR());
/* #define MMAP_FAIL_DEBUG 3 */
#ifdef MMAP_FAIL_DEBUG
		} else if (((uint)random() % MMAP_FAIL_DEBUG) == 0) {
			/* pretend mmap() failed randomly */
			dcc_error_msg(" test fail #%d mmap(%s,%#x,"OFF_HPAT")",
				      retry,
				      buf2path(b), db_pagesize, offset);
			if (0 > munmap(p, db_pagesize))
				dcc_error_msg( "test munmap(): %s",
					      ERROR_STR());
#endif
		} else {
			/* It worked.
			 * Say so if it was not the first attempt. */
			if (retry != 1)
				dcc_error_msg("try #%d"
					      " mmap(%s,%#x,"OFF_HPAT") ok",
					      retry,
					      buf2path(b), db_pagesize, offset);
			mapped = 1;
			break;
		}

		/* mmap() fails occasionally on some systems,
		 * so try to release something and try again */
		unloaded = db_unload(0, 1);
	}

	/* The loop also ends when db_unload() reports nothing more than 1.
	 * Then there is no mapping, so fail instead of
	 * recording a bad address as a valid buffer. */
	if (!mapped) {
		dcc_pemsg(EX_IOERR, emsg,
			  "mmap(%s,%#x,"OFF_HPAT") failed after %d tries",
			  buf2path(b), db_pagesize, offset, retry-1);
		return 0;
	}

	b->buf.v = p;
	b->flush = 0;
	b->flush_urgent = 0;

	if (extend)
		return 1;

	/* madvise() on some systems including FreeBSD uses a lot of CPU cycles,
	 * so it should not be done unless it is likely to do significant good.
	 * Get all of our buffers if there is plenty of memory
	 * and we are not trying to stay out of the way of dbclean. */
	if (!db_minimum_map && db_fsize <= db_max_rss) {
		/* The flat file would fit.  If the hash table would also
		 * fit, tell the kernel to be aggressive */
		if (db_fsize + db_hash_fsize <= db_max_rss
		    && 0 > DCC_MADV_WILLNEED(p))
			dcc_error_msg("madvise(WILLNEED %s,%#x): %s",
				      buf2path(b), db_pagesize, ERROR_STR());
	} else {
		if (0 > DCC_MADV_RANDOM(p))
			dcc_error_msg("madvise(RANDOM %s,%#x): %s",
				      buf2path(b), db_pagesize, ERROR_STR());
	}

	return 1;
}



/* get a free buffer for a chunk of either the hash table or database files
 * The oldest unlocked buffer is taken from its hash chain, unmapped if
 * it is in use, and put at the head of the hash chain *bh.
 * Returns the buffer, or 0 if its old contents could not be unmapped. */
static DB_BUF *
get_free_buf(DCC_EMSG emsg, DB_BUF **bh)
{
	DB_BUF *victim, *head;

	/* Search from the oldest buffer for one that is not locked.
	 * There must be one because we have more buffers than
	 * can be locked simultaneously. */
	for (victim = buf_oldest; ; victim = victim->newer) {
		if (!victim)
			dcc_logbad(EX_SOFTWARE, "broken DB buffer MRU chain");
		if (!victim->lock_cnt)
			break;
	}

	/* Take the buffer off its current hash chain,
	 * fixing the head of the chain if the buffer was first. */
	if (victim->bak)
		victim->bak->fwd = victim->fwd;
	else if (victim->hash)
		*victim->hash = victim->fwd;
	if (victim->fwd)
		victim->fwd->bak = victim->bak;

	/* discard the old contents */
	if (victim->buf_type != DB_BUF_TYPE_FREE
	    && !buf_munmap(emsg, victim))
		return 0;

	victim->flags = 0;

	/* make it the first buffer on the new hash chain */
	head = *bh;
	victim->hash = bh;
	victim->bak = 0;
	victim->fwd = head;
	*bh = victim;
	if (head)
		head->bak = victim;

	return victim;
}



/* Find the buffer assigned to page pg_num of the buf_type file and make it
 * the newest buffer on the MRU list.
 *	If no buffer on the hash chain matches, one is recycled with
 *	get_free_buf() and labelled with the type and page number.
 *	The page itself is not mapped here.
 *	Returns 0 if get_free_buf() fails. */
static DB_BUF *
find_buf(DCC_EMSG emsg, DB_BUF_TYPE buf_type, DB_PG_NUM pg_num)
{
	DB_BUF **chain;
	DB_BUF *b;

	/* search the hash chain for this page */
	chain = DB_BUF_HASH(pg_num, buf_type);
	for (b = *chain; b != 0; b = b->fwd) {
		if (b->buf_type == buf_type
		    && b->pg_num == pg_num)
			break;		/* found the buffer we need */
	}

	if (b == 0) {
		/* we ran off the end of the buffer hash chain,
		 * so claim a free buffer */
		b = get_free_buf(emsg, chain);
		if (b == 0)
			return 0;
		b->pg_num = pg_num;
		b->buf_type = buf_type;
	}

	/* nothing more to do if it is already the newest */
	if (b == buf_newest)
		return b;

	/* take it out of its current place in the MRU list */
	b->newer->older = b->older;
	if (b->older)
		b->older->newer = b->newer;
	else
		buf_oldest = b->newer;

	/* and put it at the head of the list */
	b->older = buf_newest;
	b->newer = 0;
	buf_newest->newer = b;
	buf_newest = b;

	return b;
}



/* Point the state block *st at the locked buffer for page pg_num of the
 * buf_type file.
 *	If st already holds that buffer, it is returned unchanged.
 *	Otherwise a buffer held by st is unlocked, the target buffer is found
 *	and locked, and its page is mapped with buf_mmap() if the buffer has
 *	no mapping yet.  st->d.v is cleared in that case.
 *	Returns 0 on failure. */
static DB_BUF *
find_st_buf(DCC_EMSG emsg, DB_BUF_TYPE buf_type, DB_STATE *st,
	    DB_PG_NUM pg_num, u_char extend)
{
	DB_BUF *b;

	b = st->b;
	if (b != 0) {
		/* keep the current buffer if it is the target */
		if (b->pg_num == pg_num
		    && b->buf_type == buf_type)
			return b;

		/* otherwise let go of it */
		st->d.v = 0;
		st->b = 0;
		b->lock_cnt -= 1;
		if (b->lock_cnt < 0)
			dcc_logbad(EX_SOFTWARE, "bad database buffer lock");
	}

	/* look for the buffer and lock it */
	b = find_buf(emsg, buf_type, pg_num);
	if (b == 0)
		return 0;
	b->lock_cnt += 1;

	if (!b->buf.v) {
		/* the buffer has no mapping yet, so map the page */
		if (!buf_mmap(emsg, b, pg_num, extend)) {
			/* forget the page and give up the lock */
			b->pg_num = -1;
			b->buf_type = DB_BUF_TYPE_FREE;
			b->lock_cnt -= 1;
			if (b->lock_cnt != 0)
				dcc_logbad(EX_SOFTWARE,
					   "stolen database buffer lock %d",
					   b->lock_cnt);
			return 0;
		}

		/* count the new mapping */
		if (buf_type == DB_BUF_TYPE_DB)
			++db_stats.db_mmaps;
		else if (buf_type == DB_BUF_TYPE_HASH)
			++db_stats.hash_mmaps;

	} else if (extend && !(b->flags & DB_BUF_FG_USE_WRITE)) {
		/* an already mapped buffer can be extended only if it
		 * carries DB_BUF_FG_USE_WRITE */
		dcc_logbad(EX_SOFTWARE, "extending ordinary buffer");
	}

	st->d.v = 0;
	st->b = b;
	return b;
}



/* Map page 0 of the hash table and point db_sts.hash_ctl to its start.
 *	"new" is passed to find_st_buf() as its extend flag.
 *	Returns 0 on failure. */
static u_char
map_hash_ctl(DCC_EMSG emsg, u_char new)
{
	DB_BUF *ctl_buf;

	ctl_buf = find_st_buf(emsg, DB_BUF_TYPE_HASH, &db_sts.hash_ctl,
			      0, new);
	if (ctl_buf == 0)
		return 0;

	db_sts.hash_ctl.d.v = ctl_buf->buf.v;
	db_sts.hash_ctl.s.haddr = 0;
	return 1;
}



/* mmap() a hash table entry
 *	Addresses below DB_HADDR_BASE or at or beyond db_hash_len are
 *	rejected with an error message.
 *	"new" is passed to find_st_buf() as its extend flag.
 *	Returns 0 on failure. */
static u_char
map_hash(DCC_EMSG emsg,
	 DB_HADDR haddr,		/* this entry */
	 DB_STATE *st,			/* point this to the entry */
	 u_char new)
{
	DB_PG_NUM page;
	DB_PG_OFF slot;
	DB_BUF *b;

	if (haddr < DB_HADDR_BASE || haddr >= db_hash_len) {
		dcc_pemsg(EX_DATAERR, emsg, "invalid hash address %#x",
			  haddr);
		return 0;
	}

	/* split the address into a page and a slot within the page */
	page = haddr / db_hash_page_len;
	slot = haddr % db_hash_page_len;

	b = find_st_buf(emsg, DB_BUF_TYPE_HASH, st, page, new);
	if (b == 0)
		return 0;

	st->d.h = &b->buf.h[slot];
	st->s.haddr = haddr;
	return 1;
}



/* Unlink a hash table entry from the doubly linked free list.
 *	The neighbors of the entry on the list, or the list ends kept in the
 *	hash table control page when there is no neighbor, are made to point
 *	past the entry.  The entry is then cleared and db_hash_used is
 *	incremented.
 *	uses db_sts.tmp
 *	Returns 0 if db_make_dirty() fails, a page cannot be mapped,
 *	or the free list is found to be inconsistent. */
static u_char
unlink_free_hash(DCC_EMSG emsg,
		 DB_STATE *hash_st)	/* remove this from the free list */
{
	DB_HADDR fwd, bak;

	if (!db_make_dirty(emsg))
		return 0;

	/* The entry must be marked free, have links that are either valid
	 * or the end-of-list marker, and not point to a record. */
	fwd = DB_HADDR_EX(hash_st->d.h->fwd);
	bak = DB_HADDR_EX(hash_st->d.h->bak);
	if (!HE_IS_FREE(hash_st->d.h)
	    || (DB_HADDR_INVALID(fwd) && fwd != FREE_HADDR_END)
	    || (DB_HADDR_INVALID(bak) && bak != FREE_HADDR_END)
	    || DB_HPTR_EX(hash_st->d.h->rcd) != DB_PTR_NULL) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "bad hash free list entry at %#x", hash_st->s.haddr);
		return 0;
	}

	if (fwd != FREE_HADDR_END) {
		/* There is a following free entry.  Check that it points
		 * back to this entry and then point it to our predecessor. */
		if (!map_hash(emsg, fwd, &db_sts.tmp, 0))
			return 0;
		if (DB_HADDR_EX(db_sts.tmp.d.h->bak) != hash_st->s.haddr) {
			dcc_pemsg(EX_DATAERR, emsg, "free %#x --> bad-free %#x",
				  hash_st->s.haddr, fwd);
			return 0;
		}
		DB_HADDR_CP(db_sts.tmp.d.h->bak, bak);
		SET_FLUSH_HE(&db_sts.tmp);
	} else {
		/* This entry has no successor, so free_bak in the control
		 * page must name it.  Replace that with our predecessor. */
		if (!map_hash_ctl(emsg, 0))
			return 0;
		if (db_sts.hash_ctl.d.vals->s.free_bak != hash_st->s.haddr) {
			dcc_pemsg(EX_DATAERR, emsg, "free %#x --> bad-free %#x",
				  hash_st->s.haddr, fwd);
			return 0;
		}
		db_sts.hash_ctl.d.vals->s.free_bak = bak;
		SET_FLUSH_HCTL(0);
	}

	if (bak != FREE_HADDR_END) {
		/* There is a preceding free entry.  Check that it points
		 * forward to this entry and then point it to our successor.
		 * This reuses db_sts.tmp. */
		if (!map_hash(emsg, bak, &db_sts.tmp, 0))
			return 0;
		if (DB_HADDR_EX(db_sts.tmp.d.h->fwd) != hash_st->s.haddr) {
			dcc_pemsg(EX_DATAERR, emsg, "bad free %#x <-- free %#x",
				  bak, hash_st->s.haddr);
			return 0;
		}
		DB_HADDR_CP(db_sts.tmp.d.h->fwd, fwd);
		SET_FLUSH_HE(&db_sts.tmp);
	} else {
		/* This entry has no predecessor, so free_fwd in the control
		 * page must name it.  Replace that with our successor. */
		if (!map_hash_ctl(emsg, 0))
			return 0;
		if (db_sts.hash_ctl.d.vals->s.free_fwd != hash_st->s.haddr) {
			dcc_pemsg(EX_DATAERR, emsg, "free %#x --> bad-free %#x",
				  hash_st->s.haddr, bak);
			return 0;
		}
		db_sts.hash_ctl.d.vals->s.free_fwd = fwd;
		SET_FLUSH_HCTL(0);
	}

	/* clear the entry for its new owner and count it as used */
	memset(hash_st->d.h, 0, sizeof(HASH_ENTRY));
	SET_FLUSH_HE(hash_st);

	++db_hash_used;
	return 1;
}



/* Take a free hash table entry off the free list, preferably one near the
 * given address, and leave db_sts.free pointing to it.
 *	Uses db_sts.free and, through unlink_free_hash(), db_sts.tmp */
static u_char				/* 0=failed, 1=got it */
get_free_hash(DCC_EMSG emsg,
	      DB_HADDR result)		/* try near here */
{
	DB_HADDR pg_first, pg_last, prev_free;
	int tries;

	if (db_hash_used >= db_hash_len) {
		dcc_pemsg(EX_OSFILE, emsg, "no free hash table entry;"
			  " %d of %d used", db_hash_used, db_hash_len);
		return 0;
	}

	/* find the last slot of the page containing the target */
	if (result < DB_HADDR_BASE)
		result = DB_HADDR_BASE;
	pg_first = result - (result % db_hash_page_len);
	pg_last = pg_first + db_hash_page_len-1;
	if (pg_last >= db_hash_len)
		pg_last = db_hash_len-1;

	/* Look first near the target, at up to 3 of the following slots
	 * that are before the last slot of the page. */
	tries = 0;
	while (tries < 3 && ++result < pg_last) {
		if (!map_hash(emsg, result, &db_sts.free, 0))
			return 0;
		if (HE_IS_FREE(db_sts.free.d.h))
			return unlink_free_hash(emsg, &db_sts.free);
		++tries;
	}

	/* check the local ad hoc free list at the end of the page */
	if (!map_hash(emsg, pg_last, &db_sts.free, 0))
		return 0;
	if (HE_IS_FREE(db_sts.free.d.h)) {
		/* the ad hoc free list is not empty,
		 * so try to use the previous entry */
		prev_free = DB_HADDR_EX(db_sts.free.d.h->bak);
		if (prev_free != FREE_HADDR_END
		    && !map_hash(emsg, prev_free, &db_sts.free, 0))
			return 0;
		return unlink_free_hash(emsg, &db_sts.free);
	}

	/* Give up and take the entry at the start of the free list.  This
	 * happens only when the current and all preceding pages are full. */
	if (!map_hash_ctl(emsg, 0))
		return 0;
	result = db_sts.hash_ctl.d.vals->s.free_fwd;
	if (DB_HADDR_INVALID(result)) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "broken hash free list head of %#x", result);
		return 0;
	}
	if (!map_hash(emsg, result, &db_sts.free, 0))
		return 0;
	return unlink_free_hash(emsg, &db_sts.free);
}



/* mmap() a database entry
 *	We assume that no database entry spans buffers,
 *	and that there are enough buffers to accommodate all possible
 *	concurrent requests.
 *	Fails with db_failure() if the target runs past db_fsize or past
 *	the end of its page. */
static u_char
map_db(DCC_EMSG emsg,
       DB_PTR rptr,			/* address of the record */
       u_int tgt_len,			/* its length */
       DB_STATE *st,			/* point this to the record */
       u_char extend)
{
	DB_PG_NUM page;
	DB_PG_OFF off;
	DB_BUF *b;

	/* the target must be inside the file */
	if (db_fsize < rptr+tgt_len) {
		db_failure(__LINE__,__FILE__, EX_DATAERR, emsg,
			   "invalid database address "L_HPAT" or length %d"
			   " past db_fsize "OFF_HPAT" in %s",
			   rptr, tgt_len, db_fsize, db_nm);
		return 0;
	}

	/* Try to optimize this to avoid udivdi3() and umoddi3(),
	 * because they are a major time sink here on 32-bit systems */
	page = DB_PTR2PG_NUM(rptr, db_pagesize);
#ifdef HAVE_64BIT_LONG
	off = rptr % db_pagesize;
#else
	off = rptr - page*(DB_PTR)db_pagesize;
#endif

	/* the target must not run off the end of its buffer */
	if (db_pagesize < tgt_len+off) {
		db_failure(__LINE__,__FILE__, EX_DATAERR, emsg,
			   "invalid database address "L_HPAT
			   " or length %#x in %s",
			   rptr, tgt_len, db_nm);
		return 0;
	}

	b = find_st_buf(emsg, DB_BUF_TYPE_DB, st, page, extend);
	if (b == 0)
		return 0;

	st->d.r = (DB_RCD *)&b->buf.c[off];
	st->s.rptr = rptr;
	return 1;
}



/* Map a database record and optionally report its length.
 *	Only the record header is mapped through map_db(); the length
 *	computed by DB_RCD_LEN() is then checked against the end of the
 *	buffer holding the record. */
u_char					/* 0=failed, 1=got it */
db_map_rcd(DCC_EMSG emsg,
	   DB_STATE *rcd_st,		/* point this to the record */
	   DB_PTR rptr,			/* that is here */
	   int *rcd_lenp)		/* put its length here */
{
	u_int len;

	if (DB_PTR_IS_BAD(rptr)) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "getting bogus record at "L_HPAT", in %s",
			  rptr, db_nm);
		return 0;
	}

	if (!map_db(emsg, rptr, DB_RCD_HDR_LEN, rcd_st, 0))
		return 0;

	/* the whole record must fit in the buffer */
	len = DB_RCD_LEN(rcd_st->d.r);
	if (rcd_st->d.c + len > rcd_st->b->buf.c + db_pagesize) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "invalid checksum count %d at "L_HPAT" in %s",
			  DB_NUM_CKS(rcd_st->d.r), rptr, db_nm);
		return 0;
	}

	if (rcd_lenp != 0)
		*rcd_lenp = len;
	return 1;
}



/* write the new sizes of the files into the files
 *	The hash table control page and the database header are each
 *	updated only if the file is open and its stored values for
 *	db_csize or db_hash_used are stale.
 *	Both files are attempted even if the first fails.
 *	Returns 0 if either page could not be mapped. */
static u_char
db_set_sizes(DCC_EMSG emsg)
{
	u_char ok;

	ok = 1;

	/* the control page of the hash table */
	if (db_hash_fd != -1
	    && (db_csize_stored_hash != db_csize
		|| db_hash_used_stored_hash != db_hash_used)) {
		if (map_hash_ctl(emsg, 0)) {
			db_sts.hash_ctl.d.vals->s.db_csize = db_csize;
			db_sts.hash_ctl.d.vals->s.used = db_hash_used;
			db_csize_stored_hash = db_csize;
			db_hash_used_stored_hash = db_hash_used;

			SET_FLUSH_HCTL(0);
		} else {
			ok = 0;
		}
	}

	/* the header of the database file */
	if (db_fd != -1
	    && (db_parms_stored.db_csize != db_csize
		|| db_parms_stored.hash_used != db_hash_used)) {
		if (map_db(emsg, 0, sizeof(DB_HDR), &db_sts.db_parms, 0)) {
			/* keep the mapped header, the image of what is
			 * stored, and the working parameters in step */
			db_parms.db_csize = db_csize;
			db_parms_stored.db_csize = db_csize;
			db_sts.db_parms.d.parms->db_csize = db_csize;

			db_parms.hash_used = db_hash_used;
			db_parms_stored.hash_used = db_hash_used;
			db_sts.db_parms.d.parms->hash_used = db_hash_used;

			db_parms.last_rate_sec = db_time.tv_sec;
			db_parms_stored.last_rate_sec = db_time.tv_sec;
			db_sts.db_parms.d.parms->last_rate_sec = db_time.tv_sec;

			db_set_flush(&db_sts.db_parms, 1, sizeof(DB_PARMS));
		} else {
			ok = 0;
		}
	}

	return ok;
}



/* write the database parameters into the magic number headers of the files
 *	The sizes are written first with db_set_sizes().  The database
 *	header is then rewritten only if the database file is open and
 *	db_parms differs from db_parms_stored.
 *	Returns 0 on failure. */
u_char
db_flush_parms(DCC_EMSG emsg)
{
	if (!db_set_sizes(emsg))
		return 0;

	/* finished if there is no database file
	 * or the stored parameters are current */
	if (db_fd == -1
	    || !memcmp(&db_parms, &db_parms_stored, sizeof(db_parms)))
		return 1;

	if (!map_db(emsg, 0, sizeof(DB_HDR), &db_sts.db_parms, 0))
		return 0;

	db_parms.pagesize = db_pagesize;

	db_parms_stored = db_parms;
	*db_sts.db_parms.d.parms = db_parms;
	db_set_flush(&db_sts.db_parms, 1, sizeof(DB_PARMS));

	return 1;
}



/* find a checksum in an already mapped record
 *	The checksums of the record are scanned in order and the first
 *	one of the requested type is returned.
 *	rptr is used only in the error message. */
DB_RCD_CK *				/* 0=not found, 1=broken database */
db_find_ck(DCC_EMSG emsg,
	   DB_RCD *rcd,
	   DB_PTR rptr,
	   DCC_CK_TYPES type)		/* find this type of checksum */
{
	DB_RCD_CK *ck;
	int left;

	left = DB_NUM_CKS(rcd);
	if (left >= DCC_NUM_CKS) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "impossible %d checksums in "L_HPAT" in %s",
			  left, rptr, db_nm);
		return (DB_RCD_CK *)1;
	}

	ck = rcd->cks;
	while (left != 0) {
		if (DB_CK_TYPE(ck) == type)
			return ck;
		++ck;
		--left;
	}

	return 0;
}



/* find a checksum type known to be in a record
 *	The record is mapped with db_map_rcd() and searched with
 *	db_find_ck().  A record without the checksum is an error here. */
DB_RCD_CK *				/* 0=it's not there */
db_map_rcd_ck(DCC_EMSG emsg,
	      DB_STATE *rcd_st,		/* point this to the record */
	      DB_PTR rptr,		/* that is here */
	      DCC_CK_TYPES type)	/* find this type of checksum */
{
	DB_RCD_CK *ck;

	if (!db_map_rcd(emsg, rcd_st, rptr, 0))
		return 0;

	ck = db_find_ck(emsg, rcd_st->d.r, rptr, type);

	/* db_find_ck() has already complained about a broken record */
	if (ck == (DB_RCD_CK *)1)
		return 0;

	if (ck != 0)
		return ck;

	dcc_pemsg(EX_DATAERR, emsg,
		  "missing \"%s\" checksum in "L_HPAT" in %s",
		  DB_TYPE2STR(type), rptr, db_nm);
	return 0;
}



/* See if a candidate hash modulus is divisible by any odd prime
 * from 3 through 499. */
static inline u_char			/* 1=has a small prime factor */
modulus_has_divisor(DB_HADDR len)
{
	static const int small_primes[] = {
		3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59,
		61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127,
		131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193,
		197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269,
		271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349,
		353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431,
		433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499};
	const int *pp;

	pp = &small_primes[0];
	while (pp <= LAST(small_primes)) {
		if (len % *pp == 0)
			return 1;
		++pp;
	}
	return 0;
}



/* Get a modulus for the hash function that is tolerably likely to be
 * relatively prime to most inputs.  The worst that happens when the modulus
 * is composite is that large multiples of its factors will suffer more
 * collisions.
 *	Start with the largest odd number not exceeding len-DB_HADDR_BASE
 *	and step down by 2 while the candidate has a small prime factor
 *	and is at least MIN_HASH_ENTRIES. */
DB_HADDR
get_db_hash_divisor(DB_HADDR len)
{
	DB_HADDR candidate;

	candidate = len - DB_HADDR_BASE;
	if ((candidate & 1) == 0)
		candidate -= 1;

	while (candidate >= MIN_HASH_ENTRIES
	       && modulus_has_divisor(candidate))
		candidate -= 2;

	return candidate;
}



/* Compute the home hash table address of a checksum.
 *	The 16 bytes of the checksum are treated as four 32-bit words in
 *	host byte order, folded together with the checksum type, and reduced
 *	modulo db_hash_divisor.
 *	The result is at least DB_HADDR_BASE and is never the last slot
 *	of a hash table page. */
DB_HADDR
db_hash(DCC_CK_TYPES type, const DCC_SUM sum)
{
	u_int64_t accum, wrap;
	union {
	    DCC_SUM	sum;
	    u_int32_t	words[4];
	} buf;
	DB_HADDR haddr;

	/* Always copy the checksum into properly typed and aligned storage.
	 * Reading the caller's bytes directly through a u_int32_t pointer
	 * would violate the C aliasing rules, and would need an alignment
	 * test on the pointer.  The copy is only 16 bytes, and the words
	 * have the same values as a direct read would give. */
	memcpy(buf.sum, sum, sizeof(buf.sum));

	/* MD5 checksums are uniformly distributed, and so DCC_SUMs are
	 * directly useful for hashing except when they are server-IDs */
	accum = buf.words[0];
	accum += buf.words[1];
	wrap = accum >>32;
	accum <<= 32;
	accum += wrap + type;
	accum += buf.words[2];
	accum += buf.words[3];

	haddr = accum % db_hash_divisor;
	haddr += DB_HADDR_BASE;

	/* do not hash into the last slot of a page, because it is used to
	 * find local free slots */
	if (haddr % db_hash_page_len == db_hash_page_len-1) {
		++haddr;
		if (haddr >= db_hash_len)
			haddr = DB_HADDR_BASE;
	}
	return haddr;
}



/* look for a checksum in the hash table
 *	return with an excuse, the home slot, or the last entry on
 *	the collision chain
 *
 *	DB_FOUND_LATER	the hash address is outside the window lo..hi
 *	DB_FOUND_SYSERR	something failed; a message is in emsg
 *	DB_FOUND_EMPTY	the home slot is free; hash_st points to it
 *	DB_FOUND_INTRUDER the home slot has a non-null back link and so is
 *			part of some other chain; hash_st points to it
 *	DB_FOUND_IT	hash_st points to the hash entry, rcd_st to the record,
 *			and *prcd_ck to the checksum in the record
 *	DB_FOUND_CHAIN	the checksum is not in the chain; hash_st points to
 *			the last entry of the chain */
DB_FOUND
db_lookup(DCC_EMSG emsg, DCC_CK_TYPES type, const DCC_SUM sum,
	  DB_HADDR lo,			/* postpone if out of this window */
	  DB_HADDR hi,
	  DB_STATE *hash_st,		/* hash block for record or related */
	  DB_STATE *rcd_st,		/* put the record or garbage here */
	  DB_RCD_CK **prcd_ck)		/* point to cksum if found */
{
	DB_HADDR haddr, haddr_fwd, haddr_bak;
	DB_PTR db_ptr;
	DB_RCD_CK *found_ck;
	DB_HADDR failsafe;

	haddr = db_hash(type, sum);
	if (haddr < lo || haddr > hi) {
		/* An address outside of the widest possible window
		 * is an error instead of something to postpone. */
		if (lo == 0 && hi == MAX_HASH_ENTRIES) {
			dcc_pemsg(EX_DATAERR, emsg,
				  "out of range hash address");
			return DB_FOUND_SYSERR;
		}
		return DB_FOUND_LATER;
	}

	/* note that *prcd_ck is not touched by the early returns above */
	if (prcd_ck)
	    *prcd_ck = 0;

	if (!map_hash(emsg, haddr, hash_st, 0))
		return DB_FOUND_SYSERR;

	if (HE_IS_FREE(hash_st->d.h))
		return DB_FOUND_EMPTY;

	/* an entry in its home slot is the head of its chain
	 * and so has a null back link */
	if (!DB_HADDR_C_NULL(hash_st->d.h->bak))
		return DB_FOUND_INTRUDER;

	/* We know that the current hash table entry is in its home slot.
	 * It might be for the key or checksum we are looking for
	 * or it might be for some other checksum with the same hash value.
	 * The failsafe count bounds the walk in case the chain is a loop. */
	for (failsafe = 0; failsafe <= db_hash_len; ++failsafe) {
		if (HE_CMP(hash_st->d.h, type, sum)) {
			/* This hash table entry could be for our target
			 * checksum.  Read the corresponding record so we
			 * decide whether we have a hash collision or we
			 * have found a record containing our target checksum.
			 *
			 * find right type of checksum in the record */
			db_ptr = DB_HPTR_EX(hash_st->d.h->rcd);
			found_ck = db_map_rcd_ck(emsg, rcd_st, db_ptr, type);
			if (!found_ck)
				return DB_FOUND_SYSERR;
			if (!memcmp(sum, found_ck->sum,
				    sizeof(DCC_SUM))) {
				if (prcd_ck)
					*prcd_ck = found_ck;
				return DB_FOUND_IT;
			}
		}

		/* This DB record was a hash collision, or for a checksum
		 * other than our target.
		 * Fail if this is the end of the hash chain */
		haddr_fwd = DB_HADDR_EX(hash_st->d.h->fwd);
		if (haddr_fwd == DB_HADDR_NULL)
			return DB_FOUND_CHAIN;

		if (DB_HADDR_INVALID(haddr_fwd)) {
			dcc_pemsg(EX_DATAERR, emsg,
				  "broken hash chain fwd-link"
				  " #%d %#x at %#x in %s",
				  failsafe, haddr_fwd, haddr, db_hash_nm);
			return DB_FOUND_SYSERR;
		}

		/* move to the next entry in the chain */
		if (!map_hash(emsg, haddr_fwd, hash_st, 0))
			return DB_FOUND_SYSERR;

		/* the next entry must point back to where we came from */
		haddr_bak = DB_HADDR_EX(hash_st->d.h->bak);
		if (haddr_bak != haddr) {
			dcc_pemsg(EX_DATAERR, emsg,
				  "broken hash chain links #%d,"
				  " %#x-->%#x but %#x<--%#x in %s",
				  failsafe,
				  haddr, haddr_fwd,
				  haddr_bak, haddr_fwd,
				  db_hash_nm);
			return DB_FOUND_SYSERR;
		}
		haddr = haddr_fwd;
	}
	/* more links were followed than there are hash table entries */
	dcc_pemsg(EX_DATAERR, emsg, "infinite hash chain at %#x in %s",
		  haddr, db_hash_nm);
	return DB_FOUND_SYSERR;
}



/* combine checksums
 *	Add the target count of a record to a running total, with special
 *	handling of the reserved values DCC_TGTS_OK, DCC_TGTS_OK2,
 *	DCC_TGTS_DEL, and DCC_TGTS_TOO_MANY.
 *	The checksum type is not used. */
DCC_TGTS
db_sum_ck(DCC_TGTS prev,		/* previous sum */
	  DCC_TGTS rcd_tgts,		/* from the record */
	  DCC_CK_TYPES type UATTRIB)
{
	DCC_TGTS total;

	/* This arithmetic must be commutative (after handling deleted
	 * values), because inter-server flooding causes records to appear in
	 * the database out of temporal order.
	 *
	 * DCC_TGTS_TOO_MANY can be thought of as a count of plus infinity.
	 * DCC_TGTS_OK is like minus infinity.
	 * DCC_TGTS_OK2 like half of minus infinity
	 * DCC_TGTS_TOO_MANY (plus infinity) added to DCC_TGTS_OK (minus
	 *	infinity) or DCC_TGTS_OK2 yields DCC_TGTS_OK or DCC_TGTS_OK2.
	 *
	 * Reputations never reach infinity.
	 *
	 * Claims of not-spam from all clients are discarded as they arrive
	 * and before here. They can only come from the local white list
	 */

	/* an ordinary sum that does not exceed "too many" needs no help */
	total = prev+rcd_tgts;
	if (total <= DCC_TGTS_TOO_MANY)
		return total;

	/* whitelisting by either operand wins, strongest first */
	if (prev == DCC_TGTS_OK || rcd_tgts == DCC_TGTS_OK)
		return DCC_TGTS_OK;
	if (prev == DCC_TGTS_OK2 || rcd_tgts == DCC_TGTS_OK2)
		return DCC_TGTS_OK2;

	/* a deleted record contributes nothing */
	if (rcd_tgts == DCC_TGTS_DEL)
		return prev;

	return DCC_TGTS_TOO_MANY;
}



/* delete all reports that contain the given checksum
 *	Follow the chain of records for the checksum, starting with the
 *	record in prev_st.  Reports that are older than the delete request
 *	and not from the whitelist have their target counts zeroed.
 *	The target counts of the other records are summed into *res. */
static u_char				/* 1=done, 0=broken database */
del_ck(DCC_EMSG emsg,
       DCC_TGTS *res,			/* residual targets after deletion */
       const DB_RCD *new,		/* delete reports older than this one */
       DCC_CK_TYPES type,		/* delete this type of checksum */
       DB_RCD_CK *prev_ck,		/* starting with this one */
       DB_STATE *prev_st)		/* use this scratch state block */
{
	DB_PTR older;

	*res = 0;
	do {
		if (!dcc_ts_newer_ts(&new->ts, &prev_st->d.r->ts)
		    || DB_RCD_ID(prev_st->d.r) == DCC_ID_WHITE) {
			/* sum reports that are not deleted */
			*res = db_sum_ck(*res, DB_TGTS_RCD(prev_st->d.r), type);

		} else {
			/* delete reports that are older
			 * than the delete request */
			DB_TGTS_RCD_SET(prev_st->d.r, 0);
			DB_TGTS_CK_SET(prev_ck, 0);
			SET_FLUSH_RCD(prev_st, 1);
		}

		/* stop at the end of the chain */
		older = DB_PTR_EX(prev_ck->prev);
		if (older == DB_PTR_NULL)
			return 1;

		prev_ck = db_map_rcd_ck(emsg, prev_st, older, type);
	} while (prev_ck != 0);

	/* the preceding record could not be mapped */
	return 0;
}



/* see if the new and preceding records are from the same era
 *	An era is the spam expiration of the checksum type, but no more
 *	than DCC_OLD_SPAM_SECS.  The preceding record is from a different
 *	era if its timestamp is older than the timestamp of the new record
 *	less that duration. */
static inline u_char			/* 1=different eras */
ck_old_spam(const DB_RCD *new, const DCC_TS* prev, DCC_CK_TYPES type)
{
	struct timeval new_tv;
	DCC_TS era_start;
	time_t era_secs;

	era_secs = db_parms.ex_secs[type].spam;
	if (era_secs > DCC_OLD_SPAM_SECS)
		era_secs = DCC_OLD_SPAM_SECS;

	/* the era starts that long before the new record */
	dcc_ts2timeval(&new_tv, &new->ts);
	dcc_timeval2ts(&era_start, &new_tv, -era_secs);

	return dcc_ts_older_ts(prev, &era_start);
}



/* Mark reports made obsolete by a spam report
 *	A new report of spam makes sufficiently old reports obsolete.
 *
 *	Sufficiently recent non-obsolete reports make a new report obsolete,
 *	or at least not worth spending bandwidth to flood.
 *	"Sufficiently recent" should be defined so that this server and
 *	its downstream flooding peers always have reports of the checksums
 *	in the report.  So we want to keep (not make obsolete) at least one
 *	report per expiration duration.  We cannot know the expiration durations
 *	of our peers, but we know DB_EXPIRE_SPAMSECS_DEF_MIN which influences
 *	DCC_OLD_SPAM_SECS.
 *
 *	However, if another checksum in the new report was kept, then
 *	prefer marking old checksums obsolete.
 *
 *	db_sts.rcd points to the new record
 *	db_sts.rcd2 points to the previous record and is changed
 *
 *	*keeping_new is set when a reason to keep the new record is found.
 *	At most 100 predecessors that are already obsolete or have no
 *	targets are examined before giving up and keeping the new record.
 */
static u_char				/* 1=done, 0=broken database */
ck_obs_spam(DCC_EMSG emsg,
	    const DB_RCD *new,
	    DCC_TGTS new_tgts,
	    DB_RCD_CK *new_ck,
	    DCC_CK_TYPES type,		/* check this type of checksum */
	    DB_RCD_CK *prev_ck,		/* starting with this one */
	    DCC_TGTS prev_ck_tgts,
	    u_char *keeping_new)	/* 1=already keeping the new record */
{
	int limit;
	DB_PTR prev;

	limit = 100;
	for (;;) {
		/* preceding white listed entries make new entries obsolete */
		if (DB_RCD_ID(db_sts.rcd2.d.r) == DCC_ID_WHITE) {
			new_ck->type_fgs |= DB_CK_FG_OBS;
			SET_FLUSH_RCD(&db_sts.rcd, 1);
			return 1;
		}

		if (DB_CK_OBS(prev_ck)
		    || DB_TGTS_RCD(db_sts.rcd2.d.r) == 0) {
			/* notice duplicates and
			 * don't look forever for recent non-obsolete report */
			if (!memcmp(&new->ts, &db_sts.rcd2.d.r->ts,
				    sizeof(new->ts))
			    || --limit == 0) {
				*keeping_new = 1;
				return 1;
			}

		} else if (prev_ck_tgts != DCC_TGTS_TOO_MANY) {
			/* Mark this predecessor obsolete because it
			 * was before the checksum became spam. */
			prev_ck->type_fgs |= DB_CK_FG_OBS;
			SET_FLUSH_RCD(&db_sts.rcd2, 0);

			/* continue backwards to mark more non-spam
			 * predecessors obsolete */

		} else if (!*keeping_new
			   && ck_old_spam(new, &db_sts.rcd2.d.r->ts, type)) {
			/* We do not yet have a reason to keep the new report
			 * and this predecessor is at or after a spam report.
			 * We need the new report because it and the
			 * predecessor are from different eras.
			 * If the new report is not of spam, it will be
			 * compressed with a preceding spam report. */
			*keeping_new = 1;
			/* The predecessor is not needed if the new record
			 * is for spam */
			if (new_tgts == DCC_TGTS_TOO_MANY) {
				prev_ck->type_fgs |= DB_CK_FG_OBS;
				SET_FLUSH_RCD(&db_sts.rcd2, 0);
			}
			/* We're finished, because all older preceding reports
			 * were marked obsolete when this older predecessor
			 * was linked. */
			return 1;

		} else {
			/* this predecessor is about as recent as the new
			 * record, so the new record is unneeded noise that
			 * would bloat other servers' databases. */
			new_ck->type_fgs |= DB_CK_FG_OBS;
			return 1;
		}

		/* step back to the preceding record with this checksum */
		prev = DB_PTR_EX(prev_ck->prev);
		if (prev == DB_PTR_NULL) {
			/* the new record is a new report of spam */
			*keeping_new = 1;
			return 1;
		}

		prev_ck = db_map_rcd_ck(emsg, &db_sts.rcd2, prev, type);
		if (!prev_ck)
			return 0;
		prev_ck_tgts = DB_TGTS_CK(prev_ck);
	}
}



/* mark extra server-ID declarations obsolete
 *
 *	db_sts.rcd points to the new record
 *	db_sts.rcd2 points to the previous record and is changed
 *
 *	The chain of preceding server-ID records is followed from prev_ck
 *	until a decision is made or the chain ends. */
 static u_char				/* 1=done, 0=broken database */
srvr_id_ck(DCC_EMSG emsg,
	   const DB_RCD *new,
	   DB_RCD_CK *new_ck,
	   DB_RCD_CK *prev_ck)		/* starting with this one */
{
	DB_PTR prev;
	DCC_SRVR_ID new_id, prev_id;
	struct timeval tv;
	DCC_TS week_ts;

	/* Round the timestamp of the new record down to a multiple of
	 * 7 days to get the start of its week. */
	dcc_ts2timeval(&tv, &new->ts);
	tv.tv_usec = 0;
	tv.tv_sec -= tv.tv_sec % (7*24*60*60);
	dcc_timeval2ts(&week_ts, &tv, 0);

	new_id = DB_RCD_ID(new);
	for (;;) {
		/* mark duplicate older declarations and deletions obsolete */
		prev_id = DB_RCD_ID(db_sts.rcd2.d.r);
		if (!DCC_ID_SRVR_TYPE(prev_id)
		    || DB_TGTS_RCD(db_sts.rcd2.d.r) == 0) {
			/* whichever of the new record and the predecessor
			 * is not the newer is marked obsolete */
			if (dcc_ts_newer_ts(&db_sts.rcd2.d.r->ts, &new->ts)) {
				new_ck->type_fgs |= DB_CK_FG_OBS;
				SET_FLUSH_RCD(&db_sts.rcd, 1);
			} else {
				prev_ck->type_fgs |= DB_CK_FG_OBS;
				SET_FLUSH_RCD(&db_sts.rcd2, 1);
			}
			return 1;
		}

		/* Keep many identical type declarations as a kludge to ensure
		 * that rewound flooding sends type declarations early.
		 * Keep only one declaration per week. */
		if (DCC_ID_SRVR_TYPE(new_id)) {
			/* Zap the new declaration and stop if the
			 * new declaration is older than the predecessor. */
			if (dcc_ts_newer_ts(&db_sts.rcd2.d.r->ts, &new->ts)) {
				new_ck->type_fgs |= DB_CK_FG_OBS;
				SET_FLUSH_RCD(&db_sts.rcd, 1);
				return 1;
			}

			/* Stop when we find a duplicate type declaration
			 * of a different week */
			if (prev_id == new_id
			    && dcc_ts_older_ts(&db_sts.rcd2.d.r->ts,
					       &week_ts)) {
				return 1;
			}

			/* continue zapping preceding declarations */
			prev_ck->type_fgs |= DB_CK_FG_OBS;
			SET_FLUSH_RCD(&db_sts.rcd2, 1);
		}

		/* step back to the preceding server-ID record */
		prev = DB_PTR_EX(prev_ck->prev);
		if (prev == DB_PTR_NULL)
			return 1;

		prev_ck = db_map_rcd_ck(emsg, &db_sts.rcd2,
					prev, DCC_CK_SRVR_ID);
		if (!prev_ck)
			return 0;
	}
}



/* Install pointers in the hash table for a record and fix the accumulated
 *	counts in the record pointed to by db_sts.rcd
 *	Checksums whose hash addresses are outside the window lo..hi are
 *	skipped (DB_FOUND_LATER from db_lookup()).
 *	The new sizes are written with db_set_sizes() at the end.
 *	Use db_sts.rcd, db_sts.hash, db_sts.rcd2, db_sts.free, db_sts.tmp
 *	The caller must deal with db_make_dirty() */
u_char					/* 0=failed, 1=done */
db_link_rcd(DCC_EMSG emsg, DB_HADDR lo, DB_HADDR hi)
{
	DCC_TGTS res;
	DB_RCD *rcd;
	DB_RCD_CK *prev_ck;
	DB_RCD_CK *rcd_ck;
	DCC_CK_TYPES rcd_type;
	DCC_TGTS rcd_tgts, prev_ck_tgts;
	int ck_num;
	DB_HADDR haddr;
	u_char keeping_new;

	keeping_new = 0;
	rcd = db_sts.rcd.d.r;
	rcd_tgts = DB_TGTS_RCD_RAW(rcd);
	rcd_ck = rcd->cks;
	ck_num = DB_NUM_CKS(rcd);
	if (ck_num > DIM(rcd->cks)) {
		dcc_pemsg(EX_OSFILE, emsg,
			  "bogus checksum count %#x at "L_HPAT" in %s",
			  rcd->fgs_num_cks, db_sts.rcd.s.rptr, db_nm);
		return 0;
	}
	for (; ck_num > 0; --ck_num, ++rcd_ck) {
		rcd_type = DB_CK_TYPE(rcd_ck);
		if (!DCC_CK_OK_DB(grey_on, rcd_type)) {
			dcc_pemsg(EX_OSFILE, emsg,
				  "invalid checksum type %s at "L_HPAT" in %s",
				  DB_TYPE2STR(rcd_type),
				  db_sts.rcd.s.rptr, db_nm);
			return 0;
		}

		/* start with no predecessor */
		rcd_ck->prev = DB_PTR_CP(DB_PTR_NULL);

		/* Do not link paths or whitelist file and line numbers */
		if (rcd_type == DCC_CK_FLOD_PATH) {
			DB_TGTS_CK_SET(rcd_ck, 0);
			continue;
		}

		/* Do not link or total some checksums unless they are
		 * whitelist entries.  If they are whitelist entries, they
		 * will eventually get set to DCC_TGTS_OK or DCC_TGTS_OK2.
		 * Blacklist entries are noticed later by server-ID
		 * or do not matter DCC_TGTS_TOO_MANY. */
		if (DB_TEST_NOKEEP(db_parms.nokeep_cks, rcd_type)
		    && DB_RCD_ID(rcd) != DCC_ID_WHITE) {
			DB_TGTS_CK_SET(rcd_ck, 1);
			continue;
		}

		/* a delete request itself contributes no targets */
		res = (rcd_tgts == DCC_TGTS_DEL) ? 0 : rcd_tgts;

		switch (db_lookup(emsg, rcd_type, rcd_ck->sum, lo, hi,
				  &db_sts.hash, &db_sts.rcd2, &prev_ck)) {
		case DB_FOUND_SYSERR:
			return 0;

		case DB_FOUND_LATER:
			/* outside of the window, so leave it for later */
			continue;

		case DB_FOUND_IT:
			/* We found the checksum
			 * Update the hash table to point to the new record */
			DB_HPTR_CP(db_sts.hash.d.h->rcd, db_sts.rcd.s.rptr);
			SET_FLUSH_HE(&db_sts.hash);
			/* link new record to existing record */
			rcd_ck->prev = DB_PTR_CP(db_sts.rcd2.s.rptr);

			/* delete predecessors to a delete request
			 * and compute the remaining sum */
			if (rcd_tgts == DCC_TGTS_DEL) {
				if (!del_ck(emsg, &res, rcd, rcd_type,
					    prev_ck, &db_sts.rcd2))
					return 0;
				/* delete requests are obsolete if the
				 * checksum is whitelisted */
				if (res == DCC_TGTS_OK
				    || res == DCC_TGTS_OK2)
					rcd_ck->type_fgs |= DB_CK_FG_OBS;
				break;
			}

			/* Simple checksum with a predecessor
			 * This does not do the substantial extra work
			 * to notice all delete requests that arrived early.
			 * That problem is handled by the incoming flood
			 * duplicate report detection mechanism.
			 * We must detect predecessors that were deleted because
			 * they are partial duplicates of the new record. */
			prev_ck_tgts = DB_TGTS_CK(prev_ck);
			if (DB_RCD_SUMRY(rcd))
				res = prev_ck_tgts;
			else
				res = db_sum_ck(prev_ck_tgts, res, rcd_type);
			if ((res == DCC_TGTS_OK || res == DCC_TGTS_OK2
			     || (DB_RCD_ID(db_sts.rcd2.d.r) == DCC_ID_WHITE))
			    && DB_RCD_ID(rcd) != DCC_ID_WHITE){
				/* obsolete whitelisted checksums */
				rcd_ck->type_fgs |= DB_CK_FG_OBS;
				break;
			}
			if (res == DCC_TGTS_TOO_MANY) {
				/* mark obsolete unneeded reports of spam */
				if (!DB_CK_OBS(rcd_ck)
				    && !ck_obs_spam(emsg, rcd, rcd_tgts,
						    rcd_ck, rcd_type,
						    prev_ck, prev_ck_tgts,
						    &keeping_new))
					return 0;   /* (broken database) */
			} else if (rcd_type == DCC_CK_SRVR_ID) {
				/* mark obsolete server-ID assertions */
				if (!DB_CK_OBS(rcd_ck)
				    && !srvr_id_ck(emsg, rcd, rcd_ck, prev_ck))
					return 0;   /* (broken database) */
			}
			break;

		case DB_FOUND_EMPTY:
			/* We found an empty hash table slot.
			 * Update the slot to point to our new record
			 * after removing it from the free list,
			 * which marks it dirty. */
			if (!unlink_free_hash(emsg, &db_sts.hash))
				return 0;
			DB_HPTR_CP(db_sts.hash.d.h->rcd, db_sts.rcd.s.rptr);
			HE_MERGE(db_sts.hash.d.h,rcd_type, rcd_ck->sum);
			if (res >= BULK_THRESHOLD)
				keeping_new = 1;
			break;

		case DB_FOUND_CHAIN:
			/* We found a hash collision, a chain of 1 or more
			 * records with the same hash value.
			 * Get a free slot, link it to the end of the
			 * existing chain, and point it to the new record.
			 * The buffer containing the free slot is marked
			 * dirty when it is removed from the free list. */
			if (!get_free_hash(emsg, db_sts.hash.s.haddr))
				return 0;
			DB_HADDR_CP(db_sts.free.d.h->bak, db_sts.hash.s.haddr);
			DB_HADDR_CP(db_sts.hash.d.h->fwd, db_sts.free.s.haddr);
			DB_HPTR_CP(db_sts.free.d.h->rcd, db_sts.rcd.s.rptr);
			HE_MERGE(db_sts.free.d.h,rcd_type, rcd_ck->sum);
			SET_FLUSH_HE(&db_sts.hash);
			if (res >= BULK_THRESHOLD)
				keeping_new = 1;
			break;

		case DB_FOUND_INTRUDER:
			/* The home hash slot for our key contains an
			 * intruder.  Move it to a new free slot */
			if (!get_free_hash(emsg, db_sts.hash.s.haddr))
				return 0;
			*db_sts.free.d.h = *db_sts.hash.d.h;
			/* re-link the neighbors of the intruder */
			haddr = DB_HADDR_EX(db_sts.free.d.h->bak);
			if (haddr == DB_HADDR_NULL) {
				/* Report the slot that holds the broken
				 * entry.  The link itself is known to be
				 * null and so says nothing. */
				dcc_pemsg(EX_DATAERR, emsg,
					  "bad hash chain reverse link at %#x"
					  " in %s",
					  db_sts.hash.s.haddr, db_hash_nm);
				return 0;
			}
			if (!map_hash(emsg, haddr, &db_sts.tmp, 0))
				return 0;
			DB_HADDR_CP(db_sts.tmp.d.h->fwd, db_sts.free.s.haddr);
			SET_FLUSH_HE(&db_sts.tmp);
			haddr = DB_HADDR_EX(db_sts.hash.d.h->fwd);
			if (haddr != DB_HADDR_NULL) {
				if (!map_hash(emsg, haddr, &db_sts.tmp, 0))
					return 0;
				DB_HADDR_CP(db_sts.tmp.d.h->bak,
					    db_sts.free.s.haddr);
				SET_FLUSH_HE(&db_sts.tmp);
			}
			/* install the new entry in its home slot */
			DB_HADDR_CP(db_sts.hash.d.h->fwd, DB_HADDR_NULL);
			DB_HADDR_CP(db_sts.hash.d.h->bak, DB_HADDR_NULL);
			DB_HPTR_CP(db_sts.hash.d.h->rcd, db_sts.rcd.s.rptr);
			HE_MERGE(db_sts.hash.d.h,rcd_type, rcd_ck->sum);
			SET_FLUSH_HE(&db_sts.hash);
			if (res >= BULK_THRESHOLD)
				keeping_new = 1;
			break;
		}

		/* Fix the checksum's total in the record */
		DB_TGTS_CK_SET(rcd_ck, res);
		SET_FLUSH_RCD(&db_sts.rcd, 0);
	}

	return db_set_sizes(emsg);
}



/* Append a record to the end of the database and link it into the hash table
 *	The caller must already know that the record is valid.
 *	Uses db_sts.rcd, db_sts.hash, db_sts.rcd2, db_sts.free, db_sts.tmp
 *	On return db_sts.rcd refers to the new record in the database.
 *	The result is the database offset of the new record. */
DB_PTR					/* 0=failed */
db_add_rcd(DCC_EMSG emsg, const DB_RCD *new_rcd)
{
	DB_PTR pos, end_pos, end_pg;
	u_int len, fill;
	DB_BUF *buf;

	if (!db_make_dirty(emsg))
		return 0;

	/* size of the record with only DB_NUM_CKS() checksum entries */
	len = (sizeof(*new_rcd)
	       - sizeof(new_rcd->cks)
	       + (DB_NUM_CKS(new_rcd) * sizeof(new_rcd->cks[0])));

	/* tentatively put the record at the current end of the data */
	pos = db_csize;
	end_pos = pos+len;
	end_pg = DB_PTR2PG_NUM(end_pos, db_pagesize);

	if (end_pg != DB_PTR2PG_NUM(db_csize, db_pagesize)) {
		/* The record would end on a different page than the one
		 * holding the current end of the data.  Zero the remainder
		 * of the current page, rounded up to a multiple of the
		 * record header length, and start the record after it. */
		fill = end_pg*db_pagesize - db_csize;
		fill = (((fill + DB_RCD_HDR_LEN-1) / DB_RCD_HDR_LEN)
			* DB_RCD_HDR_LEN);
		if (fill != 0) {
			if (!map_db(emsg, db_csize, fill, &db_sts.rcd, 0))
				return 0;
			memset(db_sts.rcd.d.r, 0, fill);
			db_set_flush(&db_sts.rcd, 1, fill);
			db_csize += fill;

			pos = db_csize;
			end_pos = pos+len;
		}

		/* Grow the file by an entire page using write() instead of
		 * mmap(), because extending with mmap() often fails. */
		db_fsize = db_csize+db_pagesize;
		if (!map_db(emsg, pos, db_pagesize, &db_sts.rcd, 1))
			return 0;
		buf = db_sts.rcd.b;
		buf->flush = (DB_BUF_FM)-1;

		/* When dblist or dbclean is running, force the new page to
		 * the disk and then map just the new record again. */
		if (db_minimum_map) {
			rel_db_state(&db_sts.rcd);
			if (!buf_munmap(emsg, buf))
				return 0;
			if (!map_db(emsg, pos, len, &db_sts.rcd, 0))
				return 0;
		}

	} else if (!map_db(emsg, pos, len, &db_sts.rcd, 0)) {
		/* the record fits on the current page but cannot be mapped */
		return 0;
	}

	/* Copy the record into the database. */
	memcpy(db_sts.rcd.d.r, new_rcd, len);
	/* Schedule its buffer for the disk so that the database stays as
	 * good as possible even after a crash.  Later changes to the hash
	 * links do not matter, because dbclean rebuilds them after a crash. */
	db_set_flush(&db_sts.rcd, 1, len);
	db_csize = end_pos;

	/* Add the hash table pointers and fix the totals in the record. */
	if (!db_link_rcd(emsg, 0, MAX_HASH_ENTRIES))
		return 0;

	db_stats.adds++;
	return pos;
}