[PATCH] xlog: fix fallocate vs read race

Vladimir Davydov vdavydov.dev at gmail.com
Fri Dec 14 13:29:44 MSK 2018


posix_fallocate(), which is used for preallocating disk space for WAL
files, increases the file size and fills the allocated space with zeros.
The problem is a WAL file may be read by a relay thread at the same time
it is written to. We try to handle the zeroed space in xlog_cursor (see
xlog_cursor_next_tx()), however this turns out to be not enough, because
transactions are written not atomically so it may occur that a writer
writes half a transaction when a reader reads it. Without fallocate, the
reader would stop at EOF until the rest of the transaction is written,
but with fallocate it reads zeroes instead and thinks that the xlog file
is corrupted while actually it is not.

Fix this issue by using fallocate() with FALLOC_FL_KEEP_SIZE flag
instead of posix_fallocate(). With the flag fallocate() won't increase
the file size, it will only allocate disk space beyond EOF.

The test will be added shortly.

Closes #3883
---
https://github.com/tarantool/tarantool/issues/3883
https://github.com/tarantool/tarantool/tree/dv/gh-3883-xlog-fallocate-fix

 CMakeLists.txt            |  2 +-
 src/box/xlog.c            | 32 +++++++++++++++++++-------------
 src/trivia/config.h.cmake |  2 +-
 3 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b43e0c6..c2606de2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,7 +85,7 @@ check_symbol_exists(fdatasync unistd.h HAVE_FDATASYNC)
 check_symbol_exists(pthread_yield pthread.h HAVE_PTHREAD_YIELD)
 check_symbol_exists(sched_yield sched.h HAVE_SCHED_YIELD)
 check_symbol_exists(posix_fadvise fcntl.h HAVE_POSIX_FADVISE)
-check_symbol_exists(posix_fallocate fcntl.h HAVE_POSIX_FALLOCATE)
+check_symbol_exists(fallocate fcntl.h HAVE_FALLOCATE)
 check_symbol_exists(mremap sys/mman.h HAVE_MREMAP)
 
 check_function_exists(sync_file_range HAVE_SYNC_FILE_RANGE)
diff --git a/src/box/xlog.c b/src/box/xlog.c
index 191fadcd..280c036f 100644
--- a/src/box/xlog.c
+++ b/src/box/xlog.c
@@ -996,10 +996,25 @@ xdir_create_xlog(struct xdir *dir, struct xlog *xlog,
 ssize_t
 xlog_fallocate(struct xlog *log, size_t len)
 {
-#ifdef HAVE_POSIX_FALLOCATE
-	int rc = posix_fallocate(log->fd, log->offset + log->allocated, len);
+#ifdef HAVE_FALLOCATE
+	static bool fallocate_not_supported = false;
+	if (fallocate_not_supported)
+		return 0;
+	/*
+	 * Keep the file size, because it is used to sync
+	 * concurrent readers vs the writer: xlog_cursor
+	 * assumes that everything written before EOF is
+	 * valid data.
+	 */
+	int rc = fallocate(log->fd, FALLOC_FL_KEEP_SIZE,
+			   log->offset + log->allocated, len);
 	if (rc != 0) {
-		errno = rc;
+		if (errno == ENOSYS || errno == EOPNOTSUPP) {
+			say_warn("fallocate is not supported, "
+				 "proceeding without it");
+			fallocate_not_supported = true;
+			return 0;
+		}
 		diag_set(SystemError, "%s: can't allocate disk space",
 			 log->filename);
 		return -1;
@@ -1010,7 +1025,7 @@ xlog_fallocate(struct xlog *log, size_t len)
 	(void)log;
 	(void)len;
 	return 0;
-#endif /* HAVE_POSIX_FALLOCATE */
+#endif /* HAVE_FALLOCATE */
 }
 
 /**
@@ -1832,15 +1847,6 @@ xlog_cursor_next_tx(struct xlog_cursor *i)
 		return -1;
 	if (rc > 0)
 		return 1;
-	if (load_u32(i->rbuf.rpos) == 0) {
-		/*
-		 * Space preallocated with xlog_fallocate().
-		 * Treat as eof and clear the buffer.
-		 */
-		i->read_offset -= ibuf_used(&i->rbuf);
-		ibuf_reset(&i->rbuf);
-		return 1;
-	}
 	if (load_u32(i->rbuf.rpos) == eof_marker) {
 		/* eof marker found */
 		goto eof_found;
diff --git a/src/trivia/config.h.cmake b/src/trivia/config.h.cmake
index 53eae2fe..f5c2e2c5 100644
--- a/src/trivia/config.h.cmake
+++ b/src/trivia/config.h.cmake
@@ -166,7 +166,7 @@
 #cmakedefine HAVE_PTHREAD_YIELD 1
 #cmakedefine HAVE_SCHED_YIELD 1
 #cmakedefine HAVE_POSIX_FADVISE 1
-#cmakedefine HAVE_POSIX_FALLOCATE 1
+#cmakedefine HAVE_FALLOCATE 1
 #cmakedefine HAVE_MREMAP 1
 #cmakedefine HAVE_SYNC_FILE_RANGE 1
 
-- 
2.11.0




More information about the Tarantool-patches mailing list