dovecot-2.2: lib-index: Write to transaction log using O_APPEND ...

dovecot at dovecot.org dovecot at dovecot.org
Sat Nov 3 13:48:06 EET 2012


details:   http://hg.dovecot.org/dovecot-2.2/rev/751181168b72
changeset: 15367:751181168b72
user:      Timo Sirainen <tss at iki.fi>
date:      Sat Nov 03 13:47:55 2012 +0200
description:
lib-index: Write to transaction log using O_APPEND flag.
Most OSes should do the writes atomically so that reads won't see partially
written data. We don't currently rely on this, but it would be possible in
future to remove locking entirely from writing to transaction log.

diffstat:

 src/lib-index/mail-transaction-log-append.c |  53 +++++++++++++---------------
 src/lib-index/mail-transaction-log-file.c   |  24 +++++++------
 src/lib-index/mail-transaction-log.h        |   2 +
 3 files changed, 39 insertions(+), 40 deletions(-)

diffs (193 lines):

diff -r d7239a4554e4 -r 751181168b72 src/lib-index/mail-transaction-log-append.c
--- a/src/lib-index/mail-transaction-log-append.c	Thu Nov 01 12:32:30 2012 +0200
+++ b/src/lib-index/mail-transaction-log-append.c	Sat Nov 03 13:47:55 2012 +0200
@@ -32,6 +32,7 @@
 	buffer_append(ctx->output, data, size);
 
 	mail_transaction_update_modseq(&hdr, data, &ctx->new_highest_modseq);
+	ctx->transaction_count++;
 }
 
 static int
@@ -60,8 +61,6 @@
 static int log_buffer_write(struct mail_transaction_log_append_ctx *ctx)
 {
 	struct mail_transaction_log_file *file = ctx->log->head;
-	struct mail_transaction_header *hdr;
-	uint32_t first_size;
 
 	if (ctx->output->used == 0)
 		return 0;
@@ -76,19 +75,11 @@
 		return 0;
 	}
 
-	/* size will be written later once everything is in disk */
-	hdr = buffer_get_space_unsafe(ctx->output, 0, sizeof(*hdr));
-	first_size = hdr->size;
-	i_assert(first_size != 0);
-	hdr->size = 0;
-
-	if (pwrite_full(file->fd, ctx->output->data, ctx->output->used,
-			file->sync_offset) < 0) {
+	if (write_full(file->fd, ctx->output->data, ctx->output->used) < 0) {
 		/* write failure, fallback to in-memory indexes. */
-		hdr->size = first_size;
 		mail_index_file_set_syscall_error(ctx->log->index,
 						  file->filepath,
-						  "pwrite_full()");
+						  "write_full()");
 		return log_buffer_move_to_memory(ctx);
 	}
 
@@ -96,18 +87,6 @@
 		 file->sync_offset + ctx->output->used ==
 		 file->max_tail_offset);
 
-	/* now that the whole transaction has been written, rewrite the first
-	   record's size so the transaction becomes visible */
-	hdr->size = first_size;
-	if (pwrite_full(file->fd, &first_size, sizeof(uint32_t),
-			file->sync_offset +
-			offsetof(struct mail_transaction_header, size)) < 0) {
-		mail_index_file_set_syscall_error(ctx->log->index,
-						  file->filepath,
-						  "pwrite_full()");
-		return log_buffer_move_to_memory(ctx);
-	}
-
 	if ((ctx->want_fsync &&
 	     file->log->index->fsync_mode != FSYNC_MODE_NEVER) ||
 	    file->log->index->fsync_mode == FSYNC_MODE_ALWAYS) {
@@ -119,11 +98,6 @@
 		}
 	}
 
-	/* FIXME: when we're relying on O_APPEND and someone else wrote a
-	   transaction, we'll need to wait for it to commit its transaction.
-	   if it crashes before doing that, we'll need to overwrite it with
-	   a dummy record */
-
 	if (file->mmap_base == NULL && file->buffer != NULL) {
 		/* we're reading from a file. avoid re-reading the data that
 		   we just wrote. this is also important for some NFS clients,
@@ -184,6 +158,7 @@
 mail_transaction_log_append_locked(struct mail_transaction_log_append_ctx *ctx)
 {
 	struct mail_transaction_log_file *file = ctx->log->head;
+	struct mail_transaction_boundary *boundary;
 
 	if (file->sync_offset < file->last_size) {
 		/* there is some garbage at the end of the transaction log
@@ -199,6 +174,21 @@
 		}
 	}
 
+	/* don't include log_file_tail_offset update in the transaction */
+	boundary = buffer_get_space_unsafe(ctx->output,
+				sizeof(struct mail_transaction_header),
+				sizeof(*boundary));
+	boundary->size = ctx->output->used;
+
+	if (ctx->transaction_count <= 2) {
+		/* 0-1 changes. don't bother with the boundary */
+		unsigned int boundary_size =
+			sizeof(struct mail_transaction_header) +
+			sizeof(*boundary);
+
+		buffer_delete(ctx->output, 0, boundary_size);
+	}
+
 	if (ctx->append_sync_offset)
 		log_append_sync_offset_if_needed(ctx);
 
@@ -213,6 +203,7 @@
 				      struct mail_transaction_log_append_ctx **ctx_r)
 {
 	struct mail_transaction_log_append_ctx *ctx;
+	struct mail_transaction_boundary boundary;
 
 	if (!index->log_sync_locked) {
 		if (mail_transaction_log_lock_head(index->log) < 0)
@@ -223,6 +214,10 @@
 	ctx->output = buffer_create_dynamic(default_pool, 1024);
 	ctx->trans_flags = flags;
 
+	memset(&boundary, 0, sizeof(boundary));
+	mail_transaction_log_append_add(ctx, MAIL_TRANSACTION_BOUNDARY,
+					&boundary, sizeof(boundary));
+
 	*ctx_r = ctx;
 	return 0;
 }
diff -r d7239a4554e4 -r 751181168b72 src/lib-index/mail-transaction-log-file.c
--- a/src/lib-index/mail-transaction-log-file.c	Thu Nov 01 12:32:30 2012 +0200
+++ b/src/lib-index/mail-transaction-log-file.c	Sat Nov 03 13:47:55 2012 +0200
@@ -653,6 +653,11 @@
 	int fd, ret;
 	bool rename_existing;
 
+	if (fcntl(new_fd, F_SETFL, O_APPEND) < 0) {
+		log_file_set_syscall_error(file, "fcntl(O_APPEND)");
+		return -1;
+	}
+
 	if (file->log->nfs_flush) {
 		/* although we check also mtime and file size below, it's done
 		   only to fix broken log files. we don't bother flushing
@@ -684,7 +689,7 @@
 		rename_existing = TRUE;
 	} else {
 		/* recreated. use the file if its header is ok */
-		fd = nfs_safe_open(file->filepath, O_RDWR);
+		fd = nfs_safe_open(file->filepath, O_RDWR | O_APPEND);
 		if (fd == -1) {
 			if (errno != ENOENT) {
 				log_file_set_syscall_error(file, "open()");
@@ -843,8 +848,12 @@
 	int ret;
 
         for (i = 0;; i++) {
-		file->fd = nfs_safe_open(file->filepath,
-					 !index->readonly ? O_RDWR : O_RDONLY);
+		if (!index->readonly) {
+			file->fd = nfs_safe_open(file->filepath,
+						 O_RDWR | O_APPEND);
+		} else {
+			file->fd = nfs_safe_open(file->filepath, O_RDONLY);
+		}
 		if (file->fd == -1 && errno == EACCES) {
 			file->fd = nfs_safe_open(file->filepath, O_RDONLY);
 			index->readonly = TRUE;
@@ -1370,18 +1379,11 @@
 		/* There's more data than we could sync at the moment. If the
 		   last record's size wasn't valid, we can't know if it will
 		   be updated unless we've locked the log. */
-		if (trans_size != 0) {
-			/* pread()s or the above fstat() check for mmaps should
-			   have guaranteed that this doesn't happen */
-			mail_transaction_log_file_set_corrupted(file,
-				"hdr.size too large (%u)", trans_size);
-			return -1;
-		} else if (file->locked) {
+		if (file->locked) {
 			mail_transaction_log_file_set_corrupted(file,
 				"Unexpected garbage at EOF");
 			return -1;
 		}
-
 		/* The size field will be updated soon */
 		mail_index_flush_read_cache(file->log->index, file->filepath,
 					    file->fd, file->locked);
diff -r d7239a4554e4 -r 751181168b72 src/lib-index/mail-transaction-log.h
--- a/src/lib-index/mail-transaction-log.h	Thu Nov 01 12:32:30 2012 +0200
+++ b/src/lib-index/mail-transaction-log.h	Sat Nov 03 13:47:55 2012 +0200
@@ -173,6 +173,8 @@
 	enum mail_transaction_type trans_flags;
 
 	uint64_t new_highest_modseq;
+	unsigned int transaction_count;
+
 	unsigned int append_sync_offset:1;
 	unsigned int sync_includes_this:1;
 	unsigned int want_fsync:1;


More information about the dovecot-cvs mailing list