[Dovecot] New indexing code

Timo Sirainen tss at iki.fi
Mon Jul 14 14:08:18 EEST 2003


If someone's interested, here's an explanation and test code how I was 
planning on replacing the current .imap.index.tree file's 
functionality. The .tree file will be removed so you'll save some 
disk/memory as well, something like 5-20% of whole indexes.
-------------- next part --------------
/*
   IMAP uses two ways to access messages. Message sequence numbers (MSN) and
   unique identifiers (UID). Whenever message is expunged, all the MSNs after
   it will be decreased by one.

   The problem is how to handle message expunges efficiently so that we
   minimize disk I/O at expunge time, but also keep MSN and UID lookups fast
   later.

   Easiest way to do this would be to simply compress away the expunged records
   by moving non-expunged records over them. This would keep MSN lookups always
   as a simple array lookup, but it might also mean a lot of disk I/O when
   expunging old messages in a large mailbox.

   What we do is build a binary tree of the expunged records. The binary tree
   records are stored over the expunged records, so they don't take any extra
   space. Each node in the btree contains the amount of expunge-nodes on it's
   left. That means that the next unexpunged record's MSN is the
   current_record_index+1 - left_node_count. By traversing the expunge tree
   we can find the wanted MSN pretty fast.

   UID lookups would be easiest to do using a binary search which just skips
   the expunged nodes. However nearly always we want the UID's MSN returned as
   well, so it gets slightly more complex. What we do is to traverse the
   expunge tree again and find the two nodes which contain our wanted UID
   between them, after that we'll do a binary search between them. MSN is again
   index+1 - left_node_count.

   The expunged records should still be compressed away once in a while.
   Something like when 10% of file contains expunged records, or when a lookup
   has to traverse through more than 10 nodes.

   There's several optimizations that can be done with this. All MSN lookups
   up until first expunged message are a simple array lookup. Expunging last
   few messages should rather be directly compressed instead of adding to
   expunge tree since they both require pretty much the same amount of disk I/O.

   The expunge tree should preferrably be somewhat balanced, but currently we
   don't do that. The balancing should anyway be done so that the last messages
   are the fastest to find, first messages the next fastest and messages in the
   middle the slowest.
*/

#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <assert.h>

#define TRUE 1
#define FALSE 0

#define bufsize 12345

static size_t page_size = 4096;

struct mail_index {
	unsigned char *buf;
	size_t size, used;

	unsigned int expunge_root_idx, count;
};

struct mail_index_record {
	unsigned int flags;
	unsigned int uid;
	unsigned int offset;
};

struct mail_index_expunge_record {
	unsigned int nodes_left;
	unsigned int left_idx;
	unsigned int right_idx;
};

/*union mail_index_union_record {
	struct mail_index_record rec;
	struct mail_index_expunge_record erec;
};*/

#define MAIL_FLAG_EXPUNGED (1U << 31)

static void index_set_corrupted(struct mail_index *index, const char *fmt, ...)
{
	va_list va;

	va_start(va, fmt);
	vfprintf(stderr, fmt, va);
	fprintf(stderr, "\n");
	va_end(va);
	abort();
}

static int remove_record(struct mail_index *index, unsigned int idx)
{
	struct mail_index_record *rec =
		(struct mail_index_record *) index->buf;
	struct mail_index_expunge_record *erec, *new_erec;
	unsigned int cur_idx, idx_limit;

	idx_limit = index->size / sizeof(struct mail_index_record);
	assert(idx < idx_limit);
	assert((rec[idx].flags & MAIL_FLAG_EXPUNGED) == 0);

	index->count--;

#if 0
	if (index->size < page_size*2 ||
	    idx * sizeof(struct mail_index_record) >=
	    index->size - page_size - (index->size % page_size)) {
		/* if it's only two pageful, just rewrite them */
		/* FIXME: it may have expunged records already which we need to
		   compress as well */
		memmove(&rec[idx], &rec[idx+1],
			index->size -
			(idx+1) * sizeof(struct mail_index_record));
		index->size -= sizeof(struct mail_index_record);
		return TRUE;
	}
#endif

	new_erec = (struct mail_index_expunge_record *) &rec[idx];

	if (idx > index->expunge_root_idx) {
		/* keep the root node always the highest */
		erec = (struct mail_index_expunge_record *)
			&rec[index->expunge_root_idx];
		new_erec->nodes_left = erec->nodes_left + 1;
		new_erec->left_idx = index->expunge_root_idx;
		new_erec->right_idx = (unsigned int)-1;

		index->expunge_root_idx = idx;
		return TRUE;
	}

	new_erec->nodes_left = MAIL_FLAG_EXPUNGED;
	new_erec->left_idx = (unsigned int)-1;
	new_erec->right_idx = (unsigned int)-1;

	if (index->expunge_root_idx == (unsigned int)-1) {
		index->expunge_root_idx = idx;
		return TRUE;
	}

	cur_idx = index->expunge_root_idx;
	for (;;) {
		if (cur_idx >= idx_limit) {
			index_set_corrupted(index,
				"Expunge node points outside index (%u >= %u)",
				idx, idx_limit);
			return FALSE;
		}

		erec = (struct mail_index_expunge_record *) &rec[cur_idx];
		if (idx < cur_idx) {
			/* move left */
			erec->nodes_left++;

			if (erec->left_idx == (unsigned int)-1) {
				erec->left_idx = idx;
				break;
			}

			cur_idx = erec->left_idx;
		} else {
			/* move right */
			if (erec->right_idx == (unsigned int)-1) {
				erec->right_idx = idx;
				break;
			}

			cur_idx = erec->right_idx;
		}
	}

	return TRUE;
}

static struct mail_index_record *
get_next(struct mail_index *index, unsigned int idx)
{
	struct mail_index_record *rec =
		(struct mail_index_record *) index->buf;
	unsigned int idx_limit;

	idx_limit = index->size / sizeof(struct mail_index_record);
	if (idx >= idx_limit) {
		/* last one */
		return NULL;
	}

	while (++idx < idx_limit) {
		if ((rec[idx].flags & MAIL_FLAG_EXPUNGED) == 0)
			return rec + idx;
	}

	return NULL;
}

static struct mail_index_record *
get_prev(struct mail_index *index, unsigned int idx)
{
	struct mail_index_record *rec, *first;

	if (idx == 0)
		return NULL;

	first = (struct mail_index_record *) index->buf;
	rec = ((struct mail_index_record *) index->buf) + idx - 1;

	while ((rec->flags & MAIL_FLAG_EXPUNGED) != 0) {
		if (rec == first)
			return NULL;
		rec--;
	}

	return rec;
}

static struct mail_index_record *
get_nth_record(struct mail_index *index, unsigned int seq)
{
        struct mail_index_expunge_record *erec;
	struct mail_index_record *rec, *rec_p;
	unsigned int idx, idx_limit, left_count, next_seq, cur_nodes_left;

	assert(seq != 0);

	if (seq > index->count)
		return NULL;

	rec_p = (struct mail_index_record *) index->buf;

	if (index->expunge_root_idx == (unsigned int)-1)
		return rec_p + (seq-1);

	idx_limit = index->size / sizeof(struct mail_index_record);
	idx = index->expunge_root_idx; left_count = 0;
	for (;;) {
		if (idx >= idx_limit) {
			index_set_corrupted(index,
				"Expunge node points outside index (%u >= %u)",
				idx, idx_limit);
			return NULL;
		}

		if ((rec_p[idx].flags & MAIL_FLAG_EXPUNGED) == 0) {
			index_set_corrupted(index,
				"Expunge node doesn't have EXPUNGE flag set");
			return NULL;
		}

		erec = (struct mail_index_expunge_record *) &rec_p[idx];
		cur_nodes_left = erec->nodes_left & 0x7fffffff;
		if (cur_nodes_left > idx + left_count) {
			index_set_corrupted(index,
				"Expunge left node counters corrupted "
				"(%u > %u + %u)",
				cur_nodes_left, idx, left_count);
			return NULL;
		}

		next_seq = idx - (left_count + cur_nodes_left) + 1;
		if (seq > next_seq) {
			/* it's right from us */
			if (erec->right_idx == (unsigned int)-1) {
				rec = get_next(index, idx);
				return rec + (seq - next_seq);
			}

			if (erec->right_idx <= idx) {
				index_set_corrupted(index,
					"Invalid right expunge node (%u <= %u)",
					erec->right_idx, idx);
				return NULL;
			}

			left_count += cur_nodes_left + 1;
			idx = erec->right_idx;
		} else if (seq < next_seq) {
			/* it's left from us */
			if (erec->left_idx == (unsigned int)-1) {
				rec = get_prev(index, idx);
				return rec - (next_seq - seq - 1);
			}

			if (erec->left_idx >= idx) {
				index_set_corrupted(index,
					"Invalid left expunge node (%u >= %u)",
					erec->left_idx, idx);
				return NULL;
			}
			idx = erec->left_idx;
		} else {
			rec = get_next(index, idx);
			if (rec == NULL) {
				index_set_corrupted(index,
					"No records found after idx %u", idx);
			}
			return rec;
		}
	}
}

static struct mail_index_record *
get_uid_range(struct mail_index *index, unsigned int uid1, unsigned int uid2,
	      unsigned int *seq_r)
{
        struct mail_index_expunge_record *erec;
	struct mail_index_record *rec_p, *rec;
	unsigned int idx_limit, idx, left_idx, right_idx, nodes_left;
	unsigned int left_next_idx, right_prev_idx;

	assert(uid1 <= uid2);

	if (seq_r != NULL) *seq_r = 0;

	rec_p = (struct mail_index_record *) index->buf;
	idx_limit = index->size / sizeof(struct mail_index_record);

	idx = index->expunge_root_idx; nodes_left = 0;
	left_idx = right_idx = (unsigned int)-1;
	while (idx != (unsigned int)-1) {
		if (idx >= idx_limit) {
			index_set_corrupted(index,
				"Expunge node points outside index (%u >= %u)",
				idx, idx_limit);
			return NULL;
		}

		erec = (struct mail_index_expunge_record *) &rec_p[idx];
		rec = get_prev(index, idx);

		if (rec == NULL || rec->uid < uid1) {
			left_idx = idx;
			nodes_left += (erec->nodes_left & 0x7fffffff) + 1;
			idx = erec->right_idx;
		} else {
			right_idx = idx;
			right_prev_idx = (unsigned int) (rec - rec_p);
			idx = erec->left_idx;
		}
	}

	if (left_idx == (unsigned int)-1)
		left_next_idx = 0;
	else {
		rec = get_next(index, left_idx);
		if (rec == NULL || rec->uid > uid2)
			return NULL;

		left_next_idx = (unsigned int) (rec - rec_p);
	}

	if (right_idx == (unsigned int)-1)
		right_prev_idx = idx_limit-1;

	/* FIXME: use binary search here */
	for (idx = left_next_idx; idx <= right_prev_idx; idx++) {
		if ((rec->flags & MAIL_FLAG_EXPUNGED) != 0) {
			index_set_corrupted(index,
				"Unexpectedly found expunged record");
			return NULL;
		}

		if (rec_p[idx].uid >= uid1) {
			if (rec_p[idx].uid > uid2)
				break;

			if (seq_r != NULL)
				*seq_r = idx + 1 - nodes_left;
			return rec_p + idx;
		}
	}

	return NULL;
}

static struct mail_index_record *
get_uid_range_slow(struct mail_index *index, unsigned int uid1, unsigned int uid2,
		   unsigned int *seq_r)
{
	struct mail_index_record *rec =
		(struct mail_index_record *) index->buf;
	unsigned int seq, idx, idx_limit;

	idx_limit = index->size / sizeof(struct mail_index_record); seq = 0;
	for (idx = 0; idx < idx_limit; idx++) {
		if ((rec[idx].flags & MAIL_FLAG_EXPUNGED) == 0) {
			seq++;
			if (rec[idx].uid >= uid1) {
				if (rec[idx].uid > uid2)
					break;
				*seq_r = seq;
				return rec+idx;
			}
		}
	}

	*seq_r = 0;
	return NULL;
}

int main(int argc, char *argv[])
{
	struct mail_index index = { 0 };
	struct mail_index_record *rec_p;
	unsigned int seq1, seq2, uid1, uid2;
	size_t i, j;

	index.expunge_root_idx = (unsigned int)-1;

	index.size = bufsize * sizeof(struct mail_index_record);
	printf("count = %u, malloc = %lu\n", bufsize, index.size);

	index.buf = calloc(1, index.size);
	rec_p = (struct mail_index_record *) index.buf;
	for (i = 0; i < bufsize; i++)
		rec_p[i].uid = i+1;
	index.count = bufsize;

	for (i = 0; i < bufsize-1; i++) {
		rec_p = get_nth_record(&index, rand() % index.count + 1);
		remove_record(&index, rec_p - (struct mail_index_record *) index.buf);

		for (j = 0; j < 100; j++) {
			seq1 = rand() % index.count + 1;
			uid1 = get_nth_record(&index, seq1)->uid;
			uid2 = get_uid_range(&index, uid1, uid1, &seq2)->uid;

			if (uid1 != uid2 || seq1 != seq2) {
				printf("seq %u -> uid %u, uid -> seq %u, uid %u\n",
				       seq1, uid1, seq2, uid2);
				get_uid_range(&index, uid1, uid1, &seq2)->uid;
				abort();
			}
		}

		for (j = 0; j < 100; j++) {
			uid1 = rand() % bufsize + 1;
			uid2 = rand() % (bufsize - uid1) + uid1;

			rec_p = get_uid_range(&index, uid1, uid2, &seq1);
			if (rec_p != NULL) {
				uid1 = rec_p->uid;
				uid2 = get_nth_record(&index, seq1)->uid;

				if (uid1 != uid2) {
					printf("uid: %u vs %u, seq = %u\n",
					       uid1, uid2, seq1);
					abort();
				}
			}

			rec_p = get_uid_range_slow(&index, uid1, uid2, &seq2);
			if (seq1 != seq2) {
				printf("slow way %u..%u: %u vs %u, uid = %u\n",
				       uid1, uid2, seq1, seq2, rec_p->uid);
				get_uid_range(&index, uid1, uid2, &seq1);
				abort();
			}
		}
	}
	assert(index.count == 1);
	return 0;
}


More information about the dovecot mailing list