[Midnightbsd-cvs] src [8995] trunk: zfs: illumos fixes for single copy arc, ztest race condition.

laffer1 at midnightbsd.org laffer1 at midnightbsd.org
Thu Sep 29 21:31:54 EDT 2016


Revision: 8995
          http://svnweb.midnightbsd.org/src/?rev=8995
Author:   laffer1
Date:     2016-09-29 21:31:53 -0400 (Thu, 29 Sep 2016)
Log Message:
-----------
zfs: illumos fixes for single copy arc, ztest race condition.

Modified Paths:
--------------
    trunk/cddl/contrib/opensolaris/cmd/ztest/ztest.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h

Modified: trunk/cddl/contrib/opensolaris/cmd/ztest/ztest.c
===================================================================
--- trunk/cddl/contrib/opensolaris/cmd/ztest/ztest.c	2016-09-30 01:31:09 UTC (rev 8994)
+++ trunk/cddl/contrib/opensolaris/cmd/ztest/ztest.c	2016-09-30 01:31:53 UTC (rev 8995)
@@ -121,8 +121,8 @@
 #include <sys/fs/zfs.h>
 #include <libnvpair.h>
 
-#define	ZTEST_FD_DATA 3
-#define	ZTEST_FD_RAND 4
+static int ztest_fd_data = -1;
+static int ztest_fd_rand = -1;
 
 typedef struct ztest_shared_hdr {
 	uint64_t	zh_hdr_size;
@@ -713,14 +713,17 @@
 	    UINT64_MAX >> 2);
 
 	if (strlen(altdir) > 0) {
-		char cmd[MAXNAMELEN];
-		char realaltdir[MAXNAMELEN];
+		char *cmd;
+		char *realaltdir;
 		char *bin;
 		char *ztest;
 		char *isa;
 		int isalen;
 
-		(void) realpath(getexecname(), cmd);
+		cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+		realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+		VERIFY(NULL != realpath(getexecname(), cmd));
 		if (0 != access(altdir, F_OK)) {
 			ztest_dump_core = B_FALSE;
 			fatal(B_TRUE, "invalid alternate ztest path: %s",
@@ -751,6 +754,9 @@
 			fatal(B_TRUE, "invalid alternate lib directory %s",
 			    zo->zo_alt_libpath);
 		}
+
+		umem_free(cmd, MAXPATHLEN);
+		umem_free(realaltdir, MAXPATHLEN);
 	}
 }
 
@@ -767,10 +773,12 @@
 {
 	uint64_t r;
 
+	ASSERT3S(ztest_fd_rand, >=, 0);
+
 	if (range == 0)
 		return (0);
 
-	if (read(ZTEST_FD_RAND, &r, sizeof (r)) != sizeof (r))
+	if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
 		fatal(1, "short read from /dev/urandom");
 
 	return (r % range);
@@ -4836,7 +4844,18 @@
 			if (islog)
 				(void) rw_unlock(&ztest_name_lock);
 		} else {
+			/*
+			 * Ideally we would like to be able to randomly
+			 * call vdev_[on|off]line without holding locks
+			 * to force unpredictable failures but the side
+			 * effects of vdev_[on|off]line prevent us from
+			 * doing so. We grab the ztest_vdev_lock here to
+			 * prevent a race between injection testing and
+			 * aux_vdev removal.
+			 */
+			VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
 			(void) vdev_online(spa, guid0, 0, NULL);
+			VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 		}
 	}
 
@@ -5795,29 +5814,16 @@
 }
 
 static void
-setup_fds(void)
+setup_data_fd(void)
 {
-	int fd;
-#ifdef illumos
+	static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";
 
-	char *tmp = tempnam(NULL, NULL);
-	fd = open(tmp, O_RDWR | O_CREAT, 0700);
-	ASSERT3U(fd, ==, ZTEST_FD_DATA);
-	(void) unlink(tmp);
-	free(tmp);
-#else
-	char tmp[MAXPATHLEN];
+	ztest_fd_data = mkstemp(ztest_name_data);
+	ASSERT3S(ztest_fd_data, >=, 0);
+	(void) unlink(ztest_name_data);
+}
 
-	strlcpy(tmp, ztest_opts.zo_dir, MAXPATHLEN);
-	strlcat(tmp, "/ztest.XXXXXX", MAXPATHLEN);
-	fd = mkstemp(tmp);
-	ASSERT3U(fd, ==, ZTEST_FD_DATA);
-#endif
 
-	fd = open("/dev/urandom", O_RDONLY);
-	ASSERT3U(fd, ==, ZTEST_FD_RAND);
-}
-
 static int
 shared_data_size(ztest_shared_hdr_t *hdr)
 {
@@ -5838,15 +5844,11 @@
 	int size;
 	ztest_shared_hdr_t *hdr;
 
-#ifndef illumos
-	pwrite(ZTEST_FD_DATA, "", 1, 0);
-#endif
-
 	hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
-	    PROT_READ | PROT_WRITE, MAP_SHARED, ZTEST_FD_DATA, 0);
+	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
 	ASSERT(hdr != MAP_FAILED);
 
-	VERIFY3U(0, ==, ftruncate(ZTEST_FD_DATA, sizeof (ztest_shared_hdr_t)));
+	VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t)));
 
 	hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t);
 	hdr->zh_opts_size = sizeof (ztest_shared_opts_t);
@@ -5857,7 +5859,7 @@
 	hdr->zh_ds_count = ztest_opts.zo_datasets;
 
 	size = shared_data_size(hdr);
-	VERIFY3U(0, ==, ftruncate(ZTEST_FD_DATA, size));
+	VERIFY3U(0, ==, ftruncate(ztest_fd_data, size));
 
 	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
 }
@@ -5870,7 +5872,7 @@
 	uint8_t *buf;
 
 	hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
-	    PROT_READ, MAP_SHARED, ZTEST_FD_DATA, 0);
+	    PROT_READ, MAP_SHARED, ztest_fd_data, 0);
 	ASSERT(hdr != MAP_FAILED);
 
 	size = shared_data_size(hdr);
@@ -5877,7 +5879,7 @@
 
 	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
 	hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
-	    PROT_READ | PROT_WRITE, MAP_SHARED, ZTEST_FD_DATA, 0);
+	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
 	ASSERT(hdr != MAP_FAILED);
 	buf = (uint8_t *)hdr;
 
@@ -5896,12 +5898,13 @@
 {
 	pid_t pid;
 	int status;
-	char cmdbuf[MAXPATHLEN];
+	char *cmdbuf = NULL;
 
 	pid = fork();
 
 	if (cmd == NULL) {
-		(void) strlcpy(cmdbuf, getexecname(), sizeof (cmdbuf));
+		cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+		(void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN);
 		cmd = cmdbuf;
 	}
 
@@ -5910,9 +5913,16 @@
 
 	if (pid == 0) {	/* child */
 		char *emptyargv[2] = { cmd, NULL };
+		char fd_data_str[12];
 
 		struct rlimit rl = { 1024, 1024 };
 		(void) setrlimit(RLIMIT_NOFILE, &rl);
+
+		(void) close(ztest_fd_rand);
+		VERIFY3U(11, >=,
+		    snprintf(fd_data_str, 12, "%d", ztest_fd_data));
+		VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1));
+
 		(void) enable_extended_FILE_stdio(-1, -1);
 		if (libpath != NULL)
 			VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1));
@@ -5925,6 +5935,11 @@
 		fatal(B_TRUE, "exec failed: %s", cmd);
 	}
 
+	if (cmdbuf != NULL) {
+		umem_free(cmdbuf, MAXPATHLEN);
+		cmd = NULL;
+	}
+
 	while (waitpid(pid, &status, 0) != pid)
 		continue;
 	if (statusp != NULL)
@@ -5989,25 +6004,27 @@
 	char timebuf[100];
 	char numbuf[6];
 	spa_t *spa;
-	char cmd[MAXNAMELEN];
+	char *cmd;
 	boolean_t hasalt;
+	char *fd_data_str = getenv("ZTEST_FD_DATA");
 
-	boolean_t ischild = (0 == lseek(ZTEST_FD_DATA, 0, SEEK_CUR));
-	ASSERT(ischild || errno == EBADF);
-
 	(void) setvbuf(stdout, NULL, _IOLBF, 0);
 
 	dprintf_setup(&argc, argv);
 
-	if (!ischild) {
+	ztest_fd_rand = open("/dev/urandom", O_RDONLY);
+	ASSERT3S(ztest_fd_rand, >=, 0);
+
+	if (!fd_data_str) {
 		process_options(argc, argv);
 
-		setup_fds();
+		setup_data_fd();
 		setup_hdr();
 		setup_data();
 		bcopy(&ztest_opts, ztest_shared_opts,
 		    sizeof (*ztest_shared_opts));
 	} else {
+		ztest_fd_data = atoi(fd_data_str);
 		setup_data();
 		bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts));
 	}
@@ -6014,14 +6031,14 @@
 	ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);
 
 	/* Override location of zpool.cache */
-	(void) asprintf((char **)&spa_config_path, "%s/zpool.cache",
-	    ztest_opts.zo_dir);
+	VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache",
+	    ztest_opts.zo_dir), !=, -1);
 
 	ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
 	    UMEM_NOFAIL);
 	zs = ztest_shared;
 
-	if (ischild) {
+	if (fd_data_str) {
 		metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang;
 		metaslab_df_alloc_threshold =
 		    zs->zs_metaslab_df_alloc_threshold;
@@ -6044,7 +6061,8 @@
 		    (u_longlong_t)ztest_opts.zo_time);
 	}
 
-	(void) strlcpy(cmd, getexecname(), sizeof (cmd));
+	cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+	(void) strlcpy(cmd, getexecname(), MAXNAMELEN);
 
 	zs->zs_do_init = B_TRUE;
 	if (strlen(ztest_opts.zo_alt_ztest) != 0) {
@@ -6185,5 +6203,7 @@
 		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
 	}
 
+	umem_free(cmd, MAXNAMELEN);
+
 	return (0);
 }

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	2016-09-30 01:31:09 UTC (rev 8994)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	2016-09-30 01:31:53 UTC (rev 8995)
@@ -192,6 +192,7 @@
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
+int zfs_disable_dup_eviction = 0;
 
 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
@@ -322,7 +323,6 @@
 	kstat_named_t arcstat_l2_io_error;
 	kstat_named_t arcstat_l2_size;
 	kstat_named_t arcstat_l2_hdr_size;
-	kstat_named_t arcstat_memory_throttle_count;
 	kstat_named_t arcstat_l2_write_trylock_fail;
 	kstat_named_t arcstat_l2_write_passed_headroom;
 	kstat_named_t arcstat_l2_write_spa_mismatch;
@@ -335,6 +335,10 @@
 	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
 	kstat_named_t arcstat_l2_write_buffer_list_iter;
 	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
+	kstat_named_t arcstat_memory_throttle_count;
+	kstat_named_t arcstat_duplicate_buffers;
+	kstat_named_t arcstat_duplicate_buffers_size;
+	kstat_named_t arcstat_duplicate_reads;
 } arc_stats_t;
 
 static arc_stats_t arc_stats = {
@@ -392,7 +396,6 @@
 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
 	{ "l2_size",			KSTAT_DATA_UINT64 },
 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
-	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
 	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
 	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
 	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
@@ -404,7 +407,11 @@
 	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
 	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
 	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
-	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }
+	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
+	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
+	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
+	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
+	{ "duplicate_reads",		KSTAT_DATA_UINT64 }
 };
 
 #define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
@@ -1517,6 +1524,17 @@
 	hdr->b_buf = buf;
 	arc_get_data_buf(buf);
 	bcopy(from->b_data, buf->b_data, size);
+
+	/*
+	 * This buffer already exists in the arc so create a duplicate
+	 * copy for the caller.  If the buffer is associated with user data
+	 * then track the size and number of duplicates.  These stats will be
+	 * updated as duplicate buffers are created and destroyed.
+	 */
+	if (hdr->b_type == ARC_BUFC_DATA) {
+		ARCSTAT_BUMP(arcstat_duplicate_buffers);
+		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
+	}
 	hdr->b_datacnt += 1;
 	return (buf);
 }
@@ -1617,6 +1635,16 @@
 		ASSERT3U(state->arcs_size, >=, size);
 		atomic_add_64(&state->arcs_size, -size);
 		buf->b_data = NULL;
+
+		/*
+		 * If we're destroying a duplicate buffer make sure
+		 * that the appropriate statistics are updated.
+		 */
+		if (buf->b_hdr->b_datacnt > 1 &&
+		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
+			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
+		}
 		ASSERT(buf->b_hdr->b_datacnt > 0);
 		buf->b_hdr->b_datacnt -= 1;
 	}
@@ -1803,6 +1831,48 @@
 }
 
 /*
+ * Called from the DMU to determine if the current buffer should be
+ * evicted. In order to ensure proper locking, the eviction must be initiated
+ * from the DMU. Return true if the buffer is associated with user data and
+ * duplicate buffers still exist.
+ */
+boolean_t
+arc_buf_eviction_needed(arc_buf_t *buf)
+{
+	arc_buf_hdr_t *hdr;
+	boolean_t evict_needed = B_FALSE;
+
+	if (zfs_disable_dup_eviction)
+		return (B_FALSE);
+
+	mutex_enter(&buf->b_evict_lock);
+	hdr = buf->b_hdr;
+	if (hdr == NULL) {
+		/*
+		 * We are in arc_do_user_evicts(); let that function
+		 * perform the eviction.
+		 */
+		ASSERT(buf->b_data == NULL);
+		mutex_exit(&buf->b_evict_lock);
+		return (B_FALSE);
+	} else if (buf->b_data == NULL) {
+		/*
+		 * We have already been added to the arc eviction list;
+		 * recommend eviction.
+		 */
+		ASSERT3P(hdr, ==, &arc_eviction_hdr);
+		mutex_exit(&buf->b_evict_lock);
+		return (B_TRUE);
+	}
+
+	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
+		evict_needed = B_TRUE;
+
+	mutex_exit(&buf->b_evict_lock);
+	return (evict_needed);
+}
+
+/*
  * Evict buffers from list until we've removed the specified number of
  * bytes.  Move the removed buffers to the appropriate evict state.
  * If the recycle flag is set, then attempt to "recycle" a buffer:
@@ -2888,8 +2958,10 @@
 	abuf = buf;
 	for (acb = callback_list; acb; acb = acb->acb_next) {
 		if (acb->acb_done) {
-			if (abuf == NULL)
+			if (abuf == NULL) {
+				ARCSTAT_BUMP(arcstat_duplicate_reads);
 				abuf = arc_buf_clone(buf);
+			}
 			acb->acb_buf = abuf;
 			abuf = NULL;
 		}
@@ -3408,6 +3480,16 @@
 			ASSERT3U(*size, >=, hdr->b_size);
 			atomic_add_64(size, -hdr->b_size);
 		}
+
+		/*
+		 * We're releasing a duplicate user data buffer, update
+		 * our statistics accordingly.
+		 */
+		if (hdr->b_type == ARC_BUFC_DATA) {
+			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
+			    -hdr->b_size);
+		}
 		hdr->b_datacnt -= 1;
 		arc_cksum_verify(buf);
 #ifdef illumos

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	2016-09-30 01:31:09 UTC (rev 8994)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	2016-09-30 01:31:53 UTC (rev 8995)
@@ -2071,7 +2071,24 @@
 			dbuf_evict(db);
 		} else {
 			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
-			if (!DBUF_IS_CACHEABLE(db))
+
+			/*
+			 * A dbuf will be eligible for eviction if either the
+			 * 'primarycache' property is set or a duplicate
+			 * copy of this buffer is already cached in the arc.
+			 *
+			 * In the case of the 'primarycache' a buffer
+			 * is considered for eviction if it matches the
+			 * criteria set in the property.
+			 *
+			 * To decide if our buffer is considered a
+			 * duplicate, we must call into the arc to determine
+			 * if multiple buffers are referencing the same
+			 * block on-disk. If so, then we simply evict
+			 * ourselves.
+			 */
+			if (!DBUF_IS_CACHEABLE(db) ||
+			    arc_buf_eviction_needed(db->db_buf))
 				dbuf_clear(db);
 			else
 				mutex_exit(&db->db_mtx);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	2016-09-30 01:31:09 UTC (rev 8994)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	2016-09-30 01:31:53 UTC (rev 8995)
@@ -6018,7 +6018,7 @@
 	 */
 	ASSERT(tx->tx_txg != TXG_INITIAL);
 
-	ASSERT(version <= SPA_VERSION);
+	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 	ASSERT(version >= spa_version(spa));
 
 	spa->spa_uberblock.ub_version = version;
@@ -6559,7 +6559,7 @@
 	 * future version would result in an unopenable pool, this shouldn't be
 	 * possible.
 	 */
-	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
+	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
 	ASSERT(version >= spa->spa_uberblock.ub_version);
 
 	spa->spa_uberblock.ub_version = version;

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h	2016-09-30 01:31:09 UTC (rev 8994)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h	2016-09-30 01:31:53 UTC (rev 8995)
@@ -96,6 +96,7 @@
 int arc_has_callback(arc_buf_t *buf);
 void arc_buf_freeze(arc_buf_t *buf);
 void arc_buf_thaw(arc_buf_t *buf);
+boolean_t arc_buf_eviction_needed(arc_buf_t *buf);
 #ifdef ZFS_DEBUG
 int arc_referenced(arc_buf_t *buf);
 #endif



More information about the Midnightbsd-cvs mailing list