[Dovecot] New indexing code
Timo Sirainen
tss at iki.fi
Mon Jul 14 14:08:18 EEST 2003
If someone's interested, here's an explanation and test code how I was
planning on replacing the current .imap.index.tree file's
functionality. The .tree file will be removed so you'll save some
disk/memory as well, something like 5-20% of whole indexes.
-------------- next part --------------
/*
IMAP uses two ways to access messages. Message sequence numbers (MSN) and
unique identifiers (UID). Whenever message is expunged, all the MSNs after
it will be decreased by one.
The problem is how to handle message expunges efficiently so that we
minimize disk I/O at expunge time, but also keep MSN and UID lookups fast
later.
Easiest way to do this would be to simply compress away the expunged records
by moving non-expunged records over them. This would keep MSN lookups always
as a simple array lookup, but it might also mean a lot of disk I/O when
expunging old messages in a large mailbox.
What we do is build a binary tree of the expunged records. The binary tree
records are stored over the expunged records, so they don't take any extra
space. Each node in the btree contains the amount of expunge-nodes on it's
left. That means that the next unexpunged record's MSN is the
current_record_index+1 - left_node_count. By traversing the expunge tree
we can find the wanted MSN pretty fast.
UID lookups would be easiest to do using a binary search which just skips
the expunged nodes. However nearly always we want the UID's MSN returned as
well, so it gets slightly more complex. What we do is to traverse the
expunge tree again and find the two nodes which contain our wanted UID
between them, after that we'll do a binary search between them. MSN is again
index+1 - left_node_count.
The expunged records should still be compressed away once in a while.
Something like when 10% of file contains expunged records, or when a lookup
has to traverse through more than 10 nodes.
There's several optimizations that can be done with this. All MSN lookups
up until first expunged message are a simple array lookup. Expunging last
few messages should rather be directly compressed instead of adding to
expunge tree since they both require pretty much the same amount of disk I/O.
The expunge tree should preferrably be somewhat balanced, but currently we
don't do that. The balancing should anyway be done so that the last messages
are the fastest to find, first messages the next fastest and messages in the
middle the slowest.
*/
#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <assert.h>
#define TRUE 1
#define FALSE 0
#define bufsize 12345
static size_t page_size = 4096;
struct mail_index {
unsigned char *buf;
size_t size, used;
unsigned int expunge_root_idx, count;
};
struct mail_index_record {
unsigned int flags;
unsigned int uid;
unsigned int offset;
};
struct mail_index_expunge_record {
unsigned int nodes_left;
unsigned int left_idx;
unsigned int right_idx;
};
/*union mail_index_union_record {
struct mail_index_record rec;
struct mail_index_expunge_record erec;
};*/
#define MAIL_FLAG_EXPUNGED (1U << 31)
static void index_set_corrupted(struct mail_index *index, const char *fmt, ...)
{
va_list va;
va_start(va, fmt);
vfprintf(stderr, fmt, va);
fprintf(stderr, "\n");
va_end(va);
abort();
}
static int remove_record(struct mail_index *index, unsigned int idx)
{
struct mail_index_record *rec =
(struct mail_index_record *) index->buf;
struct mail_index_expunge_record *erec, *new_erec;
unsigned int cur_idx, idx_limit;
idx_limit = index->size / sizeof(struct mail_index_record);
assert(idx < idx_limit);
assert((rec[idx].flags & MAIL_FLAG_EXPUNGED) == 0);
index->count--;
#if 0
if (index->size < page_size*2 ||
idx * sizeof(struct mail_index_record) >=
index->size - page_size - (index->size % page_size)) {
/* if it's only two pageful, just rewrite them */
/* FIXME: it may have expunged records already which we need to
compress as well */
memmove(&rec[idx], &rec[idx+1],
index->size -
(idx+1) * sizeof(struct mail_index_record));
index->size -= sizeof(struct mail_index_record);
return TRUE;
}
#endif
new_erec = (struct mail_index_expunge_record *) &rec[idx];
if (idx > index->expunge_root_idx) {
/* keep the root node always the highest */
erec = (struct mail_index_expunge_record *)
&rec[index->expunge_root_idx];
new_erec->nodes_left = erec->nodes_left + 1;
new_erec->left_idx = index->expunge_root_idx;
new_erec->right_idx = (unsigned int)-1;
index->expunge_root_idx = idx;
return TRUE;
}
new_erec->nodes_left = MAIL_FLAG_EXPUNGED;
new_erec->left_idx = (unsigned int)-1;
new_erec->right_idx = (unsigned int)-1;
if (index->expunge_root_idx == (unsigned int)-1) {
index->expunge_root_idx = idx;
return TRUE;
}
cur_idx = index->expunge_root_idx;
for (;;) {
if (cur_idx >= idx_limit) {
index_set_corrupted(index,
"Expunge node points outside index (%u >= %u)",
idx, idx_limit);
return FALSE;
}
erec = (struct mail_index_expunge_record *) &rec[cur_idx];
if (idx < cur_idx) {
/* move left */
erec->nodes_left++;
if (erec->left_idx == (unsigned int)-1) {
erec->left_idx = idx;
break;
}
cur_idx = erec->left_idx;
} else {
/* move right */
if (erec->right_idx == (unsigned int)-1) {
erec->right_idx = idx;
break;
}
cur_idx = erec->right_idx;
}
}
return TRUE;
}
static struct mail_index_record *
get_next(struct mail_index *index, unsigned int idx)
{
struct mail_index_record *rec =
(struct mail_index_record *) index->buf;
unsigned int idx_limit;
idx_limit = index->size / sizeof(struct mail_index_record);
if (idx >= idx_limit) {
/* last one */
return NULL;
}
while (++idx < idx_limit) {
if ((rec[idx].flags & MAIL_FLAG_EXPUNGED) == 0)
return rec + idx;
}
return NULL;
}
static struct mail_index_record *
get_prev(struct mail_index *index, unsigned int idx)
{
struct mail_index_record *rec, *first;
if (idx == 0)
return NULL;
first = (struct mail_index_record *) index->buf;
rec = ((struct mail_index_record *) index->buf) + idx - 1;
while ((rec->flags & MAIL_FLAG_EXPUNGED) != 0) {
if (rec == first)
return NULL;
rec--;
}
return rec;
}
static struct mail_index_record *
get_nth_record(struct mail_index *index, unsigned int seq)
{
struct mail_index_expunge_record *erec;
struct mail_index_record *rec, *rec_p;
unsigned int idx, idx_limit, left_count, next_seq, cur_nodes_left;
assert(seq != 0);
if (seq > index->count)
return NULL;
rec_p = (struct mail_index_record *) index->buf;
if (index->expunge_root_idx == (unsigned int)-1)
return rec_p + (seq-1);
idx_limit = index->size / sizeof(struct mail_index_record);
idx = index->expunge_root_idx; left_count = 0;
for (;;) {
if (idx >= idx_limit) {
index_set_corrupted(index,
"Expunge node points outside index (%u >= %u)",
idx, idx_limit);
return NULL;
}
if ((rec_p[idx].flags & MAIL_FLAG_EXPUNGED) == 0) {
index_set_corrupted(index,
"Expunge node doesn't have EXPUNGE flag set");
return NULL;
}
erec = (struct mail_index_expunge_record *) &rec_p[idx];
cur_nodes_left = erec->nodes_left & 0x7fffffff;
if (cur_nodes_left > idx + left_count) {
index_set_corrupted(index,
"Expunge left node counters corrupted "
"(%u > %u + %u)",
cur_nodes_left, idx, left_count);
return NULL;
}
next_seq = idx - (left_count + cur_nodes_left) + 1;
if (seq > next_seq) {
/* it's right from us */
if (erec->right_idx == (unsigned int)-1) {
rec = get_next(index, idx);
return rec + (seq - next_seq);
}
if (erec->right_idx <= idx) {
index_set_corrupted(index,
"Invalid right expunge node (%u <= %u)",
erec->right_idx, idx);
return NULL;
}
left_count += cur_nodes_left + 1;
idx = erec->right_idx;
} else if (seq < next_seq) {
/* it's left from us */
if (erec->left_idx == (unsigned int)-1) {
rec = get_prev(index, idx);
return rec - (next_seq - seq - 1);
}
if (erec->left_idx >= idx) {
index_set_corrupted(index,
"Invalid left expunge node (%u >= %u)",
erec->left_idx, idx);
return NULL;
}
idx = erec->left_idx;
} else {
rec = get_next(index, idx);
if (rec == NULL) {
index_set_corrupted(index,
"No records found after idx %u", idx);
}
return rec;
}
}
}
static struct mail_index_record *
get_uid_range(struct mail_index *index, unsigned int uid1, unsigned int uid2,
unsigned int *seq_r)
{
struct mail_index_expunge_record *erec;
struct mail_index_record *rec_p, *rec;
unsigned int idx_limit, idx, left_idx, right_idx, nodes_left;
unsigned int left_next_idx, right_prev_idx;
assert(uid1 <= uid2);
if (seq_r != NULL) *seq_r = 0;
rec_p = (struct mail_index_record *) index->buf;
idx_limit = index->size / sizeof(struct mail_index_record);
idx = index->expunge_root_idx; nodes_left = 0;
left_idx = right_idx = (unsigned int)-1;
while (idx != (unsigned int)-1) {
if (idx >= idx_limit) {
index_set_corrupted(index,
"Expunge node points outside index (%u >= %u)",
idx, idx_limit);
return NULL;
}
erec = (struct mail_index_expunge_record *) &rec_p[idx];
rec = get_prev(index, idx);
if (rec == NULL || rec->uid < uid1) {
left_idx = idx;
nodes_left += (erec->nodes_left & 0x7fffffff) + 1;
idx = erec->right_idx;
} else {
right_idx = idx;
right_prev_idx = (unsigned int) (rec - rec_p);
idx = erec->left_idx;
}
}
if (left_idx == (unsigned int)-1)
left_next_idx = 0;
else {
rec = get_next(index, left_idx);
if (rec == NULL || rec->uid > uid2)
return NULL;
left_next_idx = (unsigned int) (rec - rec_p);
}
if (right_idx == (unsigned int)-1)
right_prev_idx = idx_limit-1;
/* FIXME: use binary search here */
for (idx = left_next_idx; idx <= right_prev_idx; idx++) {
if ((rec->flags & MAIL_FLAG_EXPUNGED) != 0) {
index_set_corrupted(index,
"Unexpectedly found expunged record");
return NULL;
}
if (rec_p[idx].uid >= uid1) {
if (rec_p[idx].uid > uid2)
break;
if (seq_r != NULL)
*seq_r = idx + 1 - nodes_left;
return rec_p + idx;
}
}
return NULL;
}
static struct mail_index_record *
get_uid_range_slow(struct mail_index *index, unsigned int uid1, unsigned int uid2,
unsigned int *seq_r)
{
struct mail_index_record *rec =
(struct mail_index_record *) index->buf;
unsigned int seq, idx, idx_limit;
idx_limit = index->size / sizeof(struct mail_index_record); seq = 0;
for (idx = 0; idx < idx_limit; idx++) {
if ((rec[idx].flags & MAIL_FLAG_EXPUNGED) == 0) {
seq++;
if (rec[idx].uid >= uid1) {
if (rec[idx].uid > uid2)
break;
*seq_r = seq;
return rec+idx;
}
}
}
*seq_r = 0;
return NULL;
}
int main(int argc, char *argv[])
{
struct mail_index index = { 0 };
struct mail_index_record *rec_p;
unsigned int seq1, seq2, uid1, uid2;
size_t i, j;
index.expunge_root_idx = (unsigned int)-1;
index.size = bufsize * sizeof(struct mail_index_record);
printf("count = %u, malloc = %lu\n", bufsize, index.size);
index.buf = calloc(1, index.size);
rec_p = (struct mail_index_record *) index.buf;
for (i = 0; i < bufsize; i++)
rec_p[i].uid = i+1;
index.count = bufsize;
for (i = 0; i < bufsize-1; i++) {
rec_p = get_nth_record(&index, rand() % index.count + 1);
remove_record(&index, rec_p - (struct mail_index_record *) index.buf);
for (j = 0; j < 100; j++) {
seq1 = rand() % index.count + 1;
uid1 = get_nth_record(&index, seq1)->uid;
uid2 = get_uid_range(&index, uid1, uid1, &seq2)->uid;
if (uid1 != uid2 || seq1 != seq2) {
printf("seq %u -> uid %u, uid -> seq %u, uid %u\n",
seq1, uid1, seq2, uid2);
get_uid_range(&index, uid1, uid1, &seq2)->uid;
abort();
}
}
for (j = 0; j < 100; j++) {
uid1 = rand() % bufsize + 1;
uid2 = rand() % (bufsize - uid1) + uid1;
rec_p = get_uid_range(&index, uid1, uid2, &seq1);
if (rec_p != NULL) {
uid1 = rec_p->uid;
uid2 = get_nth_record(&index, seq1)->uid;
if (uid1 != uid2) {
printf("uid: %u vs %u, seq = %u\n",
uid1, uid2, seq1);
abort();
}
}
rec_p = get_uid_range_slow(&index, uid1, uid2, &seq2);
if (seq1 != seq2) {
printf("slow way %u..%u: %u vs %u, uid = %u\n",
uid1, uid2, seq1, seq2, rec_p->uid);
get_uid_range(&index, uid1, uid2, &seq1);
abort();
}
}
}
assert(index.count == 1);
return 0;
}
More information about the dovecot
mailing list