[Midnightbsd-cvs] src [8995] trunk: zfs: illumos fixes for single copy arc, ztest race condition.
laffer1 at midnightbsd.org
laffer1 at midnightbsd.org
Thu Sep 29 21:31:54 EDT 2016
Revision: 8995
http://svnweb.midnightbsd.org/src/?rev=8995
Author: laffer1
Date: 2016-09-29 21:31:53 -0400 (Thu, 29 Sep 2016)
Log Message:
-----------
zfs: illumos fixes for single copy arc, ztest race condition.
Modified Paths:
--------------
trunk/cddl/contrib/opensolaris/cmd/ztest/ztest.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
Modified: trunk/cddl/contrib/opensolaris/cmd/ztest/ztest.c
===================================================================
--- trunk/cddl/contrib/opensolaris/cmd/ztest/ztest.c 2016-09-30 01:31:09 UTC (rev 8994)
+++ trunk/cddl/contrib/opensolaris/cmd/ztest/ztest.c 2016-09-30 01:31:53 UTC (rev 8995)
@@ -121,8 +121,8 @@
#include <sys/fs/zfs.h>
#include <libnvpair.h>
-#define ZTEST_FD_DATA 3
-#define ZTEST_FD_RAND 4
+static int ztest_fd_data = -1;
+static int ztest_fd_rand = -1;
typedef struct ztest_shared_hdr {
uint64_t zh_hdr_size;
@@ -713,14 +713,17 @@
UINT64_MAX >> 2);
if (strlen(altdir) > 0) {
- char cmd[MAXNAMELEN];
- char realaltdir[MAXNAMELEN];
+ char *cmd;
+ char *realaltdir;
char *bin;
char *ztest;
char *isa;
int isalen;
- (void) realpath(getexecname(), cmd);
+ cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+ realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+ VERIFY(NULL != realpath(getexecname(), cmd));
if (0 != access(altdir, F_OK)) {
ztest_dump_core = B_FALSE;
fatal(B_TRUE, "invalid alternate ztest path: %s",
@@ -751,6 +754,9 @@
fatal(B_TRUE, "invalid alternate lib directory %s",
zo->zo_alt_libpath);
}
+
+ umem_free(cmd, MAXPATHLEN);
+ umem_free(realaltdir, MAXPATHLEN);
}
}
@@ -767,10 +773,12 @@
{
uint64_t r;
+ ASSERT3S(ztest_fd_rand, >=, 0);
+
if (range == 0)
return (0);
- if (read(ZTEST_FD_RAND, &r, sizeof (r)) != sizeof (r))
+ if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
fatal(1, "short read from /dev/urandom");
return (r % range);
@@ -4836,7 +4844,18 @@
if (islog)
(void) rw_unlock(&ztest_name_lock);
} else {
+ /*
+ * Ideally we would like to be able to randomly
+ * call vdev_[on|off]line without holding locks
+ * to force unpredictable failures but the side
+ * effects of vdev_[on|off]line prevent us from
+ * doing so. We grab the ztest_vdev_lock here to
+ * prevent a race between injection testing and
+ * aux_vdev removal.
+ */
+ VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
(void) vdev_online(spa, guid0, 0, NULL);
+ VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
}
}
@@ -5795,29 +5814,16 @@
}
static void
-setup_fds(void)
+setup_data_fd(void)
{
- int fd;
-#ifdef illumos
+ static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";
- char *tmp = tempnam(NULL, NULL);
- fd = open(tmp, O_RDWR | O_CREAT, 0700);
- ASSERT3U(fd, ==, ZTEST_FD_DATA);
- (void) unlink(tmp);
- free(tmp);
-#else
- char tmp[MAXPATHLEN];
+ ztest_fd_data = mkstemp(ztest_name_data);
+ ASSERT3S(ztest_fd_data, >=, 0);
+ (void) unlink(ztest_name_data);
+}
- strlcpy(tmp, ztest_opts.zo_dir, MAXPATHLEN);
- strlcat(tmp, "/ztest.XXXXXX", MAXPATHLEN);
- fd = mkstemp(tmp);
- ASSERT3U(fd, ==, ZTEST_FD_DATA);
-#endif
- fd = open("/dev/urandom", O_RDONLY);
- ASSERT3U(fd, ==, ZTEST_FD_RAND);
-}
-
static int
shared_data_size(ztest_shared_hdr_t *hdr)
{
@@ -5838,15 +5844,11 @@
int size;
ztest_shared_hdr_t *hdr;
-#ifndef illumos
- pwrite(ZTEST_FD_DATA, "", 1, 0);
-#endif
-
hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
- PROT_READ | PROT_WRITE, MAP_SHARED, ZTEST_FD_DATA, 0);
+ PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
ASSERT(hdr != MAP_FAILED);
- VERIFY3U(0, ==, ftruncate(ZTEST_FD_DATA, sizeof (ztest_shared_hdr_t)));
+ VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t)));
hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t);
hdr->zh_opts_size = sizeof (ztest_shared_opts_t);
@@ -5857,7 +5859,7 @@
hdr->zh_ds_count = ztest_opts.zo_datasets;
size = shared_data_size(hdr);
- VERIFY3U(0, ==, ftruncate(ZTEST_FD_DATA, size));
+ VERIFY3U(0, ==, ftruncate(ztest_fd_data, size));
(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
}
@@ -5870,7 +5872,7 @@
uint8_t *buf;
hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
- PROT_READ, MAP_SHARED, ZTEST_FD_DATA, 0);
+ PROT_READ, MAP_SHARED, ztest_fd_data, 0);
ASSERT(hdr != MAP_FAILED);
size = shared_data_size(hdr);
@@ -5877,7 +5879,7 @@
(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
- PROT_READ | PROT_WRITE, MAP_SHARED, ZTEST_FD_DATA, 0);
+ PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
ASSERT(hdr != MAP_FAILED);
buf = (uint8_t *)hdr;
@@ -5896,12 +5898,13 @@
{
pid_t pid;
int status;
- char cmdbuf[MAXPATHLEN];
+ char *cmdbuf = NULL;
pid = fork();
if (cmd == NULL) {
- (void) strlcpy(cmdbuf, getexecname(), sizeof (cmdbuf));
+ cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+ (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN);
cmd = cmdbuf;
}
@@ -5910,9 +5913,16 @@
if (pid == 0) { /* child */
char *emptyargv[2] = { cmd, NULL };
+ char fd_data_str[12];
struct rlimit rl = { 1024, 1024 };
(void) setrlimit(RLIMIT_NOFILE, &rl);
+
+ (void) close(ztest_fd_rand);
+ VERIFY3U(11, >=,
+ snprintf(fd_data_str, 12, "%d", ztest_fd_data));
+ VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1));
+
(void) enable_extended_FILE_stdio(-1, -1);
if (libpath != NULL)
VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1));
@@ -5925,6 +5935,11 @@
fatal(B_TRUE, "exec failed: %s", cmd);
}
+ if (cmdbuf != NULL) {
+ umem_free(cmdbuf, MAXPATHLEN);
+ cmd = NULL;
+ }
+
while (waitpid(pid, &status, 0) != pid)
continue;
if (statusp != NULL)
@@ -5989,25 +6004,27 @@
char timebuf[100];
char numbuf[6];
spa_t *spa;
- char cmd[MAXNAMELEN];
+ char *cmd;
boolean_t hasalt;
+ char *fd_data_str = getenv("ZTEST_FD_DATA");
- boolean_t ischild = (0 == lseek(ZTEST_FD_DATA, 0, SEEK_CUR));
- ASSERT(ischild || errno == EBADF);
-
(void) setvbuf(stdout, NULL, _IOLBF, 0);
dprintf_setup(&argc, argv);
- if (!ischild) {
+ ztest_fd_rand = open("/dev/urandom", O_RDONLY);
+ ASSERT3S(ztest_fd_rand, >=, 0);
+
+ if (!fd_data_str) {
process_options(argc, argv);
- setup_fds();
+ setup_data_fd();
setup_hdr();
setup_data();
bcopy(&ztest_opts, ztest_shared_opts,
sizeof (*ztest_shared_opts));
} else {
+ ztest_fd_data = atoi(fd_data_str);
setup_data();
bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts));
}
@@ -6014,14 +6031,14 @@
ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);
/* Override location of zpool.cache */
- (void) asprintf((char **)&spa_config_path, "%s/zpool.cache",
- ztest_opts.zo_dir);
+ VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache",
+ ztest_opts.zo_dir), !=, -1);
ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
UMEM_NOFAIL);
zs = ztest_shared;
- if (ischild) {
+ if (fd_data_str) {
metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang;
metaslab_df_alloc_threshold =
zs->zs_metaslab_df_alloc_threshold;
@@ -6044,7 +6061,8 @@
(u_longlong_t)ztest_opts.zo_time);
}
- (void) strlcpy(cmd, getexecname(), sizeof (cmd));
+ cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+ (void) strlcpy(cmd, getexecname(), MAXNAMELEN);
zs->zs_do_init = B_TRUE;
if (strlen(ztest_opts.zo_alt_ztest) != 0) {
@@ -6185,5 +6203,7 @@
kills, iters - kills, (100.0 * kills) / MAX(1, iters));
}
+ umem_free(cmd, MAXNAMELEN);
+
return (0);
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c 2016-09-30 01:31:09 UTC (rev 8994)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c 2016-09-30 01:31:53 UTC (rev 8995)
@@ -192,6 +192,7 @@
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
+int zfs_disable_dup_eviction = 0;
TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
@@ -322,7 +323,6 @@
kstat_named_t arcstat_l2_io_error;
kstat_named_t arcstat_l2_size;
kstat_named_t arcstat_l2_hdr_size;
- kstat_named_t arcstat_memory_throttle_count;
kstat_named_t arcstat_l2_write_trylock_fail;
kstat_named_t arcstat_l2_write_passed_headroom;
kstat_named_t arcstat_l2_write_spa_mismatch;
@@ -335,6 +335,10 @@
kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
kstat_named_t arcstat_l2_write_buffer_list_iter;
kstat_named_t arcstat_l2_write_buffer_list_null_iter;
+ kstat_named_t arcstat_memory_throttle_count;
+ kstat_named_t arcstat_duplicate_buffers;
+ kstat_named_t arcstat_duplicate_buffers_size;
+ kstat_named_t arcstat_duplicate_reads;
} arc_stats_t;
static arc_stats_t arc_stats = {
@@ -392,7 +396,6 @@
{ "l2_io_error", KSTAT_DATA_UINT64 },
{ "l2_size", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
- { "memory_throttle_count", KSTAT_DATA_UINT64 },
{ "l2_write_trylock_fail", KSTAT_DATA_UINT64 },
{ "l2_write_passed_headroom", KSTAT_DATA_UINT64 },
{ "l2_write_spa_mismatch", KSTAT_DATA_UINT64 },
@@ -404,7 +407,11 @@
{ "l2_write_pios", KSTAT_DATA_UINT64 },
{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
{ "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 },
- { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }
+ { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
+ { "memory_throttle_count", KSTAT_DATA_UINT64 },
+ { "duplicate_buffers", KSTAT_DATA_UINT64 },
+ { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
+ { "duplicate_reads", KSTAT_DATA_UINT64 }
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
@@ -1517,6 +1524,17 @@
hdr->b_buf = buf;
arc_get_data_buf(buf);
bcopy(from->b_data, buf->b_data, size);
+
+ /*
+ * This buffer already exists in the arc so create a duplicate
+ * copy for the caller. If the buffer is associated with user data
+ * then track the size and number of duplicates. These stats will be
+ * updated as duplicate buffers are created and destroyed.
+ */
+ if (hdr->b_type == ARC_BUFC_DATA) {
+ ARCSTAT_BUMP(arcstat_duplicate_buffers);
+ ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
+ }
hdr->b_datacnt += 1;
return (buf);
}
@@ -1617,6 +1635,16 @@
ASSERT3U(state->arcs_size, >=, size);
atomic_add_64(&state->arcs_size, -size);
buf->b_data = NULL;
+
+ /*
+ * If we're destroying a duplicate buffer make sure
+ * that the appropriate statistics are updated.
+ */
+ if (buf->b_hdr->b_datacnt > 1 &&
+ buf->b_hdr->b_type == ARC_BUFC_DATA) {
+ ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+ ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
+ }
ASSERT(buf->b_hdr->b_datacnt > 0);
buf->b_hdr->b_datacnt -= 1;
}
@@ -1803,6 +1831,48 @@
}
/*
+ * Called from the DMU to determine if the current buffer should be
+ * evicted. In order to ensure proper locking, the eviction must be initiated
+ * from the DMU. Return true if the buffer is associated with user data and
+ * duplicate buffers still exist.
+ */
+boolean_t
+arc_buf_eviction_needed(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr;
+ boolean_t evict_needed = B_FALSE;
+
+ if (zfs_disable_dup_eviction)
+ return (B_FALSE);
+
+ mutex_enter(&buf->b_evict_lock);
+ hdr = buf->b_hdr;
+ if (hdr == NULL) {
+ /*
+ * We are in arc_do_user_evicts(); let that function
+ * perform the eviction.
+ */
+ ASSERT(buf->b_data == NULL);
+ mutex_exit(&buf->b_evict_lock);
+ return (B_FALSE);
+ } else if (buf->b_data == NULL) {
+ /*
+ * We have already been added to the arc eviction list;
+ * recommend eviction.
+ */
+ ASSERT3P(hdr, ==, &arc_eviction_hdr);
+ mutex_exit(&buf->b_evict_lock);
+ return (B_TRUE);
+ }
+
+ if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
+ evict_needed = B_TRUE;
+
+ mutex_exit(&buf->b_evict_lock);
+ return (evict_needed);
+}
+
+/*
* Evict buffers from list until we've removed the specified number of
* bytes. Move the removed buffers to the appropriate evict state.
* If the recycle flag is set, then attempt to "recycle" a buffer:
@@ -2888,8 +2958,10 @@
abuf = buf;
for (acb = callback_list; acb; acb = acb->acb_next) {
if (acb->acb_done) {
- if (abuf == NULL)
+ if (abuf == NULL) {
+ ARCSTAT_BUMP(arcstat_duplicate_reads);
abuf = arc_buf_clone(buf);
+ }
acb->acb_buf = abuf;
abuf = NULL;
}
@@ -3408,6 +3480,16 @@
ASSERT3U(*size, >=, hdr->b_size);
atomic_add_64(size, -hdr->b_size);
}
+
+ /*
+ * We're releasing a duplicate user data buffer, update
+ * our statistics accordingly.
+ */
+ if (hdr->b_type == ARC_BUFC_DATA) {
+ ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+ ARCSTAT_INCR(arcstat_duplicate_buffers_size,
+ -hdr->b_size);
+ }
hdr->b_datacnt -= 1;
arc_cksum_verify(buf);
#ifdef illumos
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c 2016-09-30 01:31:09 UTC (rev 8994)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c 2016-09-30 01:31:53 UTC (rev 8995)
@@ -2071,7 +2071,24 @@
dbuf_evict(db);
} else {
VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
- if (!DBUF_IS_CACHEABLE(db))
+
+ /*
+ * A dbuf will be eligible for eviction if either the
+ * 'primarycache' property is set or a duplicate
+ * copy of this buffer is already cached in the arc.
+ *
+ * In the case of the 'primarycache' a buffer
+ * is considered for eviction if it matches the
+ * criteria set in the property.
+ *
+ * To decide if our buffer is considered a
+ * duplicate, we must call into the arc to determine
+ * if multiple buffers are referencing the same
+ * block on-disk. If so, then we simply evict
+ * ourselves.
+ */
+ if (!DBUF_IS_CACHEABLE(db) ||
+ arc_buf_eviction_needed(db->db_buf))
dbuf_clear(db);
else
mutex_exit(&db->db_mtx);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c 2016-09-30 01:31:09 UTC (rev 8994)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c 2016-09-30 01:31:53 UTC (rev 8995)
@@ -6018,7 +6018,7 @@
*/
ASSERT(tx->tx_txg != TXG_INITIAL);
- ASSERT(version <= SPA_VERSION);
+ ASSERT(SPA_VERSION_IS_SUPPORTED(version));
ASSERT(version >= spa_version(spa));
spa->spa_uberblock.ub_version = version;
@@ -6559,7 +6559,7 @@
* future version would result in an unopenable pool, this shouldn't be
* possible.
*/
- ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
+ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
ASSERT(version >= spa->spa_uberblock.ub_version);
spa->spa_uberblock.ub_version = version;
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h 2016-09-30 01:31:09 UTC (rev 8994)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h 2016-09-30 01:31:53 UTC (rev 8995)
@@ -96,6 +96,7 @@
int arc_has_callback(arc_buf_t *buf);
void arc_buf_freeze(arc_buf_t *buf);
void arc_buf_thaw(arc_buf_t *buf);
+boolean_t arc_buf_eviction_needed(arc_buf_t *buf);
#ifdef ZFS_DEBUG
int arc_referenced(arc_buf_t *buf);
#endif
More information about the Midnightbsd-cvs
mailing list