[Midnightbsd-cvs] src [8987] trunk: Merge several zfs fixes from Illumos and FreeBSD.

laffer1 at midnightbsd.org laffer1 at midnightbsd.org
Thu Sep 29 21:23:04 EDT 2016


Revision: 8987
          http://svnweb.midnightbsd.org/src/?rev=8987
Author:   laffer1
Date:     2016-09-29 21:23:04 -0400 (Thu, 29 Sep 2016)
Log Message:
-----------
Merge several zfs fixes from Illumos and FreeBSD.  Stop uninitialized warnings.  Merge ZFS I/O deadman thread. This feature panics the system on a hanging zfs.  This can be controlled by vfs.zfs.deadman_enabled and vfs.zfs.deadman_synctime.  Speed up metaslab_sync.

Modified Paths:
--------------
    trunk/cddl/contrib/opensolaris/cmd/zdb/zdb.c
    trunk/cddl/contrib/opensolaris/cmd/zfs/zfs.8
    trunk/cddl/contrib/opensolaris/cmd/zinject/translate.c
    trunk/cddl/contrib/opensolaris/cmd/zinject/zinject.c
    trunk/cddl/contrib/opensolaris/cmd/zpool/zpool.8
    trunk/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
    trunk/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
    trunk/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
    trunk/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
    trunk/sys/cddl/compat/opensolaris/sys/time.h
    trunk/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c
    trunk/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h
    trunk/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h

Modified: trunk/cddl/contrib/opensolaris/cmd/zdb/zdb.c
===================================================================
--- trunk/cddl/contrib/opensolaris/cmd/zdb/zdb.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/cddl/contrib/opensolaris/cmd/zdb/zdb.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -545,7 +545,7 @@
 dump_metaslab_stats(metaslab_t *msp)
 {
 	char maxbuf[32];
-	space_map_t *sm = &msp->ms_map;
+	space_map_t *sm = msp->ms_map;
 	avl_tree_t *t = sm->sm_pp_root;
 	int free_pct = sm->sm_space * 100 / sm->sm_size;
 
@@ -561,7 +561,7 @@
 {
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
-	space_map_t *sm = &msp->ms_map;
+	space_map_t *sm = msp->ms_map;
 	space_map_obj_t *smo = &msp->ms_smo;
 	char freebuf[32];
 
@@ -1189,7 +1189,7 @@
 }
 
 static void
-dump_bpobj(bpobj_t *bpo, char *name)
+dump_bpobj(bpobj_t *bpo, char *name, int indent)
 {
 	char bytes[32];
 	char comp[32];
@@ -1199,25 +1199,50 @@
 		return;
 
 	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes);
-	if (bpo->bpo_havesubobj) {
+	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
 		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp);
 		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp);
-		(void) printf("\n    %s: %llu local blkptrs, %llu subobjs, "
-		    "%s (%s/%s comp)\n",
-		    name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+		(void) printf("    %*s: object %llu, %llu local blkptrs, "
+		    "%llu subobjs, %s (%s/%s comp)\n",
+		    indent * 8, name,
+		    (u_longlong_t)bpo->bpo_object,
+		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 		    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
 		    bytes, comp, uncomp);
+
+		for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
+			uint64_t subobj;
+			bpobj_t subbpo;
+			int error;
+			VERIFY0(dmu_read(bpo->bpo_os,
+			    bpo->bpo_phys->bpo_subobjs,
+			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
+			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
+			if (error != 0) {
+				(void) printf("ERROR %u while trying to open "
+				    "subobj id %llu\n",
+				    error, (u_longlong_t)subobj);
+				continue;
+			}
+			dump_bpobj(&subbpo, "subobj", indent + 1);
+			bpobj_close(&subbpo);
+		}
 	} else {
-		(void) printf("\n    %s: %llu blkptrs, %s\n",
-		    name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes);
+		(void) printf("    %*s: object %llu, %llu blkptrs, %s\n",
+		    indent * 8, name,
+		    (u_longlong_t)bpo->bpo_object,
+		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+		    bytes);
 	}
 
 	if (dump_opt['d'] < 5)
 		return;
 
-	(void) printf("\n");
 
-	(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
+	if (indent == 0) {
+		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
+		(void) printf("\n");
+	}
 }
 
 static void
@@ -1224,6 +1249,7 @@
 dump_deadlist(dsl_deadlist_t *dl)
 {
 	dsl_deadlist_entry_t *dle;
+	uint64_t unused;
 	char bytes[32];
 	char comp[32];
 	char uncomp[32];
@@ -1242,14 +1268,24 @@
 
 	(void) printf("\n");
 
+	/* force the tree to be loaded */
+	dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
+
 	for (dle = avl_first(&dl->dl_tree); dle;
 	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
-		(void) printf("      mintxg %llu -> obj %llu\n",
-		    (longlong_t)dle->dle_mintxg,
-		    (longlong_t)dle->dle_bpobj.bpo_object);
+		if (dump_opt['d'] >= 5) {
+			char buf[128];
+			(void) snprintf(buf, sizeof (buf), "mintxg %llu -> ",
+			    (longlong_t)dle->dle_mintxg,
+			    (longlong_t)dle->dle_bpobj.bpo_object);
 
-		if (dump_opt['d'] >= 5)
-			dump_bpobj(&dle->dle_bpobj, "");
+			dump_bpobj(&dle->dle_bpobj, buf, 0);
+		} else {
+			(void) printf("mintxg %llu -> obj %llu\n",
+			    (longlong_t)dle->dle_mintxg,
+			    (longlong_t)dle->dle_bpobj.bpo_object);
+
+		}
 	}
 }
 
@@ -1272,7 +1308,7 @@
  * print uid or gid information.
  * For normal POSIX id just the id is printed in decimal format.
  * For CIFS files with FUID the fuid is printed in hex followed by
- * the doman-rid string.
+ * the domain-rid string.
  */
 static void
 print_idstr(uint64_t id, const char *id_type)
@@ -2160,11 +2196,11 @@
 			for (int m = 0; m < vd->vdev_ms_count; m++) {
 				metaslab_t *msp = vd->vdev_ms[m];
 				mutex_enter(&msp->ms_lock);
-				space_map_unload(&msp->ms_map);
-				VERIFY(space_map_load(&msp->ms_map,
+				space_map_unload(msp->ms_map);
+				VERIFY(space_map_load(msp->ms_map,
 				    &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo,
 				    spa->spa_meta_objset) == 0);
-				msp->ms_map.sm_ppd = vd;
+				msp->ms_map->sm_ppd = vd;
 				mutex_exit(&msp->ms_lock);
 			}
 		}
@@ -2187,7 +2223,7 @@
 			for (int m = 0; m < vd->vdev_ms_count; m++) {
 				metaslab_t *msp = vd->vdev_ms[m];
 				mutex_enter(&msp->ms_lock);
-				space_map_unload(&msp->ms_map);
+				space_map_unload(msp->ms_map);
 				mutex_exit(&msp->ms_lock);
 			}
 		}
@@ -2529,10 +2565,11 @@
 	if (dump_opt['d'] || dump_opt['i']) {
 		dump_dir(dp->dp_meta_objset);
 		if (dump_opt['d'] >= 3) {
-			dump_bpobj(&spa->spa_deferred_bpobj, "Deferred frees");
+			dump_bpobj(&spa->spa_deferred_bpobj,
+			    "Deferred frees", 0);
 			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 				dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj,
-				    "Pool snapshot frees");
+				    "Pool snapshot frees", 0);
 			}
 
 			if (spa_feature_is_active(spa,

Modified: trunk/cddl/contrib/opensolaris/cmd/zfs/zfs.8
===================================================================
--- trunk/cddl/contrib/opensolaris/cmd/zfs/zfs.8	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/cddl/contrib/opensolaris/cmd/zfs/zfs.8	2016-09-30 01:23:04 UTC (rev 8987)
@@ -517,6 +517,39 @@
 .Qq Nm Cm destroy -d
 command. Otherwise, the property is
 .Cm off .
+.It Sy logicalreferenced
+The amount of space that is
+.Qq logically
+accessible by this dataset.
+See the
+.Sy referenced
+property.
+The logical space ignores the effect of the
+.Sy compression
+and
+.Sy copies
+properties, giving a quantity closer to the amount of data that applications
+see.
+However, it does include space consumed by metadata.
+.Pp
+This property can also be referred to by its shortened column name,
+.Sy lrefer .
+.It Sy logicalused
+The amount of space that is
+.Qq logically
+consumed by this dataset and all its descendents.
+See the
+.Sy used
+property.
+The logical space ignores the effect of the
+.Sy compression
+and
+.Sy copies
+properties, giving a quantity closer to the amount of data that applications
+see.
+.Pp
+This property can also be referred to by its shortened column name,
+.Sy lused .
 .It Sy mounted
 For file systems, indicates whether the file system is currently mounted. This
 property can be either

Modified: trunk/cddl/contrib/opensolaris/cmd/zinject/translate.c
===================================================================
--- trunk/cddl/contrib/opensolaris/cmd/zinject/translate.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/cddl/contrib/opensolaris/cmd/zinject/translate.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <libzfs.h>
@@ -455,6 +456,20 @@
 		    &record->zi_guid) == 0);
 	}
 
+	/*
+	 * Device faults can take on three different forms:
+	 * 1). delayed or hanging I/O
+	 * 2). zfs label faults
+	 * 3). generic disk faults
+	 */
+	if (record->zi_timer != 0) {
+		record->zi_cmd = ZINJECT_DELAY_IO;
+	} else if (label_type != TYPE_INVAL) {
+		record->zi_cmd = ZINJECT_LABEL_FAULT;
+	} else {
+		record->zi_cmd = ZINJECT_DEVICE_FAULT;
+	}
+
 	switch (label_type) {
 	case TYPE_LABEL_UBERBLOCK:
 		record->zi_start = offsetof(vdev_label_t, vl_uberblock[0]);

Modified: trunk/cddl/contrib/opensolaris/cmd/zinject/zinject.c
===================================================================
--- trunk/cddl/contrib/opensolaris/cmd/zinject/zinject.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/cddl/contrib/opensolaris/cmd/zinject/zinject.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -603,7 +604,7 @@
 	}
 
 	while ((c = getopt(argc, argv,
-	    ":aA:b:d:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
+	    ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
 		switch (c) {
 		case 'a':
 			flags |= ZINJECT_FLUSH_ARC;
@@ -629,6 +630,15 @@
 		case 'd':
 			device = optarg;
 			break;
+		case 'D':
+			record.zi_timer = strtoull(optarg, &end, 10);
+			if (errno != 0 || *end != '\0') {
+				(void) fprintf(stderr, "invalid i/o delay "
+				    "value: '%s'\n", optarg);
+				usage();
+				return (1);
+			}
+			break;
 		case 'e':
 			if (strcasecmp(optarg, "io") == 0) {
 				error = EIO;
@@ -693,6 +703,7 @@
 		case 'p':
 			(void) strlcpy(record.zi_func, optarg,
 			    sizeof (record.zi_func));
+			record.zi_cmd = ZINJECT_PANIC;
 			break;
 		case 'q':
 			quiet = 1;
@@ -766,13 +777,15 @@
 	argc -= optind;
 	argv += optind;
 
+	if (record.zi_duration != 0)
+		record.zi_cmd = ZINJECT_IGNORED_WRITES;
+
 	if (cancel != NULL) {
 		/*
 		 * '-c' is invalid with any other options.
 		 */
 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-		    level != 0 || record.zi_func[0] != '\0' ||
-		    record.zi_duration != 0) {
+		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
 			(void) fprintf(stderr, "cancel (-c) incompatible with "
 			    "any other options\n");
 			usage();
@@ -804,8 +817,7 @@
 		 * for doing injection, so handle it separately here.
 		 */
 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-		    level != 0 || record.zi_func[0] != '\0' ||
-		    record.zi_duration != 0) {
+		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
 			(void) fprintf(stderr, "device (-d) incompatible with "
 			    "data error injection\n");
 			usage();
@@ -839,7 +851,7 @@
 
 	} else if (raw != NULL) {
 		if (range != NULL || type != TYPE_INVAL || level != 0 ||
-		    record.zi_func[0] != '\0' || record.zi_duration != 0) {
+		    record.zi_cmd != ZINJECT_UNINITIALIZED) {
 			(void) fprintf(stderr, "raw (-b) format with "
 			    "any other options\n");
 			usage();
@@ -862,13 +874,14 @@
 			return (1);
 		}
 
+		record.zi_cmd = ZINJECT_DATA_FAULT;
 		if (translate_raw(raw, &record) != 0)
 			return (1);
 		if (!error)
 			error = EIO;
-	} else if (record.zi_func[0] != '\0') {
+	} else if (record.zi_cmd == ZINJECT_PANIC) {
 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-		    level != 0 || device != NULL || record.zi_duration != 0) {
+		    level != 0 || device != NULL) {
 			(void) fprintf(stderr, "panic (-p) incompatible with "
 			    "other options\n");
 			usage();
@@ -886,7 +899,7 @@
 		if (argv[1] != NULL)
 			record.zi_type = atoi(argv[1]);
 		dataset[0] = '\0';
-	} else if (record.zi_duration != 0) {
+	} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
 		if (nowrites == 0) {
 			(void) fprintf(stderr, "-s or -g meaningless "
 			    "without -I (ignore writes)\n");
@@ -940,6 +953,7 @@
 			return (1);
 		}
 
+		record.zi_cmd = ZINJECT_DATA_FAULT;
 		if (translate_record(type, argv[0], range, level, &record, pool,
 		    dataset) != 0)
 			return (1);

Modified: trunk/cddl/contrib/opensolaris/cmd/zpool/zpool.8
===================================================================
--- trunk/cddl/contrib/opensolaris/cmd/zpool/zpool.8	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/cddl/contrib/opensolaris/cmd/zpool/zpool.8	2016-09-30 01:23:04 UTC (rev 8987)
@@ -25,7 +25,7 @@
 .\"
 .\" $MidnightBSD$
 .\"
-.Dd November 15, 2012
+.Dd March 14, 2013
 .Dt ZPOOL 8
 .Os
 .Sh NAME
@@ -1608,8 +1608,8 @@
 .Ar count
 reports are printed.
 .Pp
-If a scrub or resilver is in progress, this command reports the percentage done
-and the estimated time to completion. Both of these are only approximate,
+If a scrub or resilver is in progress, this command reports the percentage
+done and the estimated time to completion. Both of these are only approximate,
 because the amount of data in the pool and the other workloads on the system
 can change.
 .Bl -tag -width indent
@@ -1616,6 +1616,7 @@
 .It Fl x
 Only display status for pools that are exhibiting errors or are otherwise
 unavailable.
+Warnings about pools not using the latest on-disk format will not be included.
 .It Fl v
 Displays verbose data error information, printing out a complete list of all
 data errors since the last complete pool scrub.

Modified: trunk/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
===================================================================
--- trunk/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -4030,7 +4030,10 @@
 	 * If we were given 'zpool status -x', only report those pools with
 	 * problems.
 	 */
-	if (reason == ZPOOL_STATUS_OK && cbp->cb_explain) {
+	if (cbp->cb_explain &&
+	    (reason == ZPOOL_STATUS_OK ||
+	    reason == ZPOOL_STATUS_VERSION_OLDER ||
+	    reason == ZPOOL_STATUS_FEAT_DISABLED)) {
 		if (!cbp->cb_allpools) {
 			(void) printf(gettext("pool '%s' is healthy\n"),
 			    zpool_get_name(zhp));

Modified: trunk/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
===================================================================
--- trunk/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h	2016-09-30 01:23:04 UTC (rev 8987)
@@ -24,6 +24,7 @@
  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel at dawidek.net>.
  * All rights reserved.
  * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
  */
 
 #ifndef	_LIBFS_IMPL_H
@@ -216,6 +217,7 @@
 
 #ifndef sun
 static int zfs_kernel_version = 0;
+static int zfs_ioctl_version = 0;
 
 /*
  * This is FreeBSD version of ioctl, because Solaris' ioctl() updates
@@ -225,20 +227,35 @@
 static __inline int
 zcmd_ioctl(int fd, unsigned long cmd, zfs_cmd_t *zc)
 {
-	size_t oldsize, zfs_kernel_version_size;
+	size_t oldsize, zfs_kernel_version_size, zfs_ioctl_version_size;
 	int version, ret, cflag = ZFS_CMD_COMPAT_NONE;
 
-	zfs_kernel_version_size = sizeof(zfs_kernel_version);
-	if (zfs_kernel_version == 0) {
-		sysctlbyname("vfs.zfs.version.spa", &zfs_kernel_version,
-		    &zfs_kernel_version_size, NULL, 0);
+	zfs_ioctl_version_size = sizeof(zfs_ioctl_version);
+	if (zfs_ioctl_version == 0) {
+		sysctlbyname("vfs.zfs.version.ioctl", &zfs_ioctl_version,
+		    &zfs_ioctl_version_size, NULL, 0);
 	}
 
-	if (zfs_kernel_version == SPA_VERSION_15 ||
-	    zfs_kernel_version == SPA_VERSION_14 ||
-	    zfs_kernel_version == SPA_VERSION_13)
-		cflag = ZFS_CMD_COMPAT_V15;
+	/*
+	 * If vfs.zfs.version.ioctl is not defined, assume we have v28
+	 * compatible binaries and use vfs.zfs.version.spa to test for v15
+	 */
+	if (zfs_ioctl_version < ZFS_IOCVER_DEADMAN) {
+		cflag = ZFS_CMD_COMPAT_V28;
+		zfs_kernel_version_size = sizeof(zfs_kernel_version);
 
+		if (zfs_kernel_version == 0) {
+			sysctlbyname("vfs.zfs.version.spa",
+			    &zfs_kernel_version,
+			    &zfs_kernel_version_size, NULL, 0);
+		}
+
+		if (zfs_kernel_version == SPA_VERSION_15 ||
+		    zfs_kernel_version == SPA_VERSION_14 ||
+		    zfs_kernel_version == SPA_VERSION_13)
+			cflag = ZFS_CMD_COMPAT_V15;
+	}
+
 	oldsize = zc->zc_nvlist_dst_size;
 	ret = zcmd_ioctl_compat(fd, cmd, zc, cflag);
 

Modified: trunk/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
===================================================================
--- trunk/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -45,6 +45,9 @@
 uint64_t physmem;
 vnode_t *rootdir = (vnode_t *)0xabcd1234;
 char hw_serial[HW_HOSTID_LEN];
+#ifdef illumos
+kmutex_t cpu_lock;
+#endif
 
 struct utsname utsname = {
 	"userland", "libzpool", "1", "1", "na"
@@ -842,6 +845,28 @@
 	return (0);
 }
 
+#ifdef illumos
+/* ARGSUSED */
+cyclic_id_t
+cyclic_add(cyc_handler_t *hdlr, cyc_time_t *when)
+{
+	return (1);
+}
+
+/* ARGSUSED */
+void
+cyclic_remove(cyclic_id_t id)
+{
+}
+
+/* ARGSUSED */
+int
+cyclic_reprogram(cyclic_id_t id, hrtime_t expiration)
+{
+	return (1);
+}
+#endif
+
 /*
  * =========================================================================
  * kernel emulation setup & teardown
@@ -875,6 +900,10 @@
 
 	system_taskq_init();
 
+#ifdef illumos
+	mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL);
+#endif
+
 	spa_init(mode);
 }
 

Modified: trunk/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
===================================================================
--- trunk/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h	2016-09-30 01:23:04 UTC (rev 8987)
@@ -465,6 +465,9 @@
 
 extern void delay(clock_t ticks);
 
+#define	SEC_TO_TICK(sec)	((sec) * hz)
+#define	NSEC_TO_TICK(usec)	((usec) / (NANOSEC / hz))
+
 #define	gethrestime_sec() time(NULL)
 #define	gethrestime(t) \
 	do {\
@@ -632,6 +635,36 @@
 #define	ERESTART	(-1)
 #endif
 
+#ifdef illumos
+/*
+ * Cyclic information
+ */
+extern kmutex_t cpu_lock;
+
+typedef uintptr_t cyclic_id_t;
+typedef uint16_t cyc_level_t;
+typedef void (*cyc_func_t)(void *);
+
+#define	CY_LOW_LEVEL	0
+#define	CY_INFINITY	INT64_MAX
+#define	CYCLIC_NONE	((cyclic_id_t)0)
+
+typedef struct cyc_time {
+	hrtime_t cyt_when;
+	hrtime_t cyt_interval;
+} cyc_time_t;
+
+typedef struct cyc_handler {
+	cyc_func_t cyh_func;
+	void *cyh_arg;
+	cyc_level_t cyh_level;
+} cyc_handler_t;
+
+extern cyclic_id_t cyclic_add(cyc_handler_t *, cyc_time_t *);
+extern void cyclic_remove(cyclic_id_t);
+extern int cyclic_reprogram(cyclic_id_t, hrtime_t);
+#endif	/* illumos */
+
 #ifdef	__cplusplus
 }
 #endif

Modified: trunk/sys/cddl/compat/opensolaris/sys/time.h
===================================================================
--- trunk/sys/cddl/compat/opensolaris/sys/time.h	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/compat/opensolaris/sys/time.h	2016-09-30 01:23:04 UTC (rev 8987)
@@ -47,6 +47,9 @@
 	((ts)->tv_sec < INT64_MIN || (ts)->tv_sec > INT64_MAX)
 #endif
 
+#define	SEC_TO_TICK(sec)	((sec) * hz)
+#define	NSEC_TO_TICK(usec)	((usec) / (NANOSEC / hz))
+
 #ifdef _KERNEL
 static __inline hrtime_t
 gethrtime(void) {

Modified: trunk/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
+ * Copyright 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
  * Portions Copyright 2005, 2010, Oracle and/or its affiliates.
  * All rights reserved.
  * Use is subject to license terms.
@@ -35,8 +35,13 @@
 #include <sys/zfs_ioctl.h>
 #include "zfs_ioctl_compat.h"
 
+static int zfs_version_ioctl = ZFS_IOCVER_CURRENT;
+SYSCTL_DECL(_vfs_zfs_version);
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl,
+    0, "ZFS_IOCTL_VERSION");
+
 /*
- * FreeBSD zfs_cmd compatibility with v15 and older binaries
+ * FreeBSD zfs_cmd compatibility with older binaries
  * appropriately remap/extend the zfs_cmd_t structure
  */
 void
@@ -43,14 +48,87 @@
 zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag)
 {
 	zfs_cmd_v15_t *zc_c;
+	zfs_cmd_v28_t *zc28_c;
 
-	if (cflag == ZFS_CMD_COMPAT_V15) {
+	switch (cflag) {
+	case ZFS_CMD_COMPAT_V28:
+		zc28_c = (void *)addr;
+
+		/* zc */
+		strlcpy(zc->zc_name, zc28_c->zc_name, MAXPATHLEN);
+		strlcpy(zc->zc_value, zc28_c->zc_value, MAXPATHLEN * 2);
+		strlcpy(zc->zc_string, zc28_c->zc_string, MAXPATHLEN);
+		strlcpy(zc->zc_top_ds, zc28_c->zc_top_ds, MAXPATHLEN);
+		zc->zc_guid = zc28_c->zc_guid;
+		zc->zc_nvlist_conf = zc28_c->zc_nvlist_conf;
+		zc->zc_nvlist_conf_size = zc28_c->zc_nvlist_conf_size;
+		zc->zc_nvlist_src = zc28_c->zc_nvlist_src;
+		zc->zc_nvlist_src_size = zc28_c->zc_nvlist_src_size;
+		zc->zc_nvlist_dst = zc28_c->zc_nvlist_dst;
+		zc->zc_nvlist_dst_size = zc28_c->zc_nvlist_dst_size;
+		zc->zc_cookie = zc28_c->zc_cookie;
+		zc->zc_objset_type = zc28_c->zc_objset_type;
+		zc->zc_perm_action = zc28_c->zc_perm_action;
+		zc->zc_history = zc28_c->zc_history;
+		zc->zc_history_len = zc28_c->zc_history_len;
+		zc->zc_history_offset = zc28_c->zc_history_offset;
+		zc->zc_obj = zc28_c->zc_obj;
+		zc->zc_iflags = zc28_c->zc_iflags;
+		zc->zc_share = zc28_c->zc_share;
+		zc->zc_jailid = zc28_c->zc_jailid;
+		zc->zc_objset_stats = zc28_c->zc_objset_stats;
+		zc->zc_begin_record = zc28_c->zc_begin_record;
+		zc->zc_defer_destroy = zc28_c->zc_defer_destroy;
+		zc->zc_temphold = zc28_c->zc_temphold;
+		zc->zc_action_handle = zc28_c->zc_action_handle;
+		zc->zc_cleanup_fd = zc28_c->zc_cleanup_fd;
+		zc->zc_simple = zc28_c->zc_simple;
+		bcopy(zc28_c->zc_pad, zc->zc_pad, sizeof(zc->zc_pad));
+		zc->zc_sendobj = zc28_c->zc_sendobj;
+		zc->zc_fromobj = zc28_c->zc_fromobj;
+		zc->zc_createtxg = zc28_c->zc_createtxg;
+		zc->zc_stat = zc28_c->zc_stat;
+
+		/* zc->zc_inject_record */
+		zc->zc_inject_record.zi_objset =
+		    zc28_c->zc_inject_record.zi_objset;
+		zc->zc_inject_record.zi_object =
+		    zc28_c->zc_inject_record.zi_object;
+		zc->zc_inject_record.zi_start =
+		    zc28_c->zc_inject_record.zi_start;
+		zc->zc_inject_record.zi_end =
+		    zc28_c->zc_inject_record.zi_end;
+		zc->zc_inject_record.zi_guid =
+		    zc28_c->zc_inject_record.zi_guid;
+		zc->zc_inject_record.zi_level =
+		    zc28_c->zc_inject_record.zi_level;
+		zc->zc_inject_record.zi_error =
+		    zc28_c->zc_inject_record.zi_error;
+		zc->zc_inject_record.zi_type =
+		    zc28_c->zc_inject_record.zi_type;
+		zc->zc_inject_record.zi_freq =
+		    zc28_c->zc_inject_record.zi_freq;
+		zc->zc_inject_record.zi_failfast =
+		    zc28_c->zc_inject_record.zi_failfast;
+		strlcpy(zc->zc_inject_record.zi_func,
+		    zc28_c->zc_inject_record.zi_func, MAXNAMELEN);
+		zc->zc_inject_record.zi_iotype =
+		    zc28_c->zc_inject_record.zi_iotype;
+		zc->zc_inject_record.zi_duration =
+		    zc28_c->zc_inject_record.zi_duration;
+		zc->zc_inject_record.zi_timer =
+		    zc28_c->zc_inject_record.zi_timer;
+		zc->zc_inject_record.zi_cmd = ZINJECT_UNINITIALIZED;
+		zc->zc_inject_record.zi_pad = 0;
+		break;
+
+	case ZFS_CMD_COMPAT_V15:
 		zc_c = (void *)addr;
 
 		/* zc */
-		strlcpy(zc->zc_name,zc_c->zc_name,MAXPATHLEN);
-		strlcpy(zc->zc_value,zc_c->zc_value,MAXPATHLEN);
-		strlcpy(zc->zc_string,zc_c->zc_string,MAXPATHLEN);
+		strlcpy(zc->zc_name, zc_c->zc_name, MAXPATHLEN);
+		strlcpy(zc->zc_value, zc_c->zc_value, MAXPATHLEN);
+		strlcpy(zc->zc_string, zc_c->zc_string, MAXPATHLEN);
 		zc->zc_guid = zc_c->zc_guid;
 		zc->zc_nvlist_conf = zc_c->zc_nvlist_conf;
 		zc->zc_nvlist_conf_size = zc_c->zc_nvlist_conf_size;
@@ -91,6 +169,7 @@
 		    zc_c->zc_inject_record.zi_freq;
 		zc->zc_inject_record.zi_failfast =
 		    zc_c->zc_inject_record.zi_failfast;
+		break;
 	}
 }
 
@@ -98,15 +177,84 @@
 zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int cflag)
 {
 	zfs_cmd_v15_t *zc_c;
+	zfs_cmd_v28_t *zc28_c;
 
 	switch (cflag) {
+	case ZFS_CMD_COMPAT_V28:
+		zc28_c = (void *)addr;
+
+		strlcpy(zc28_c->zc_name, zc->zc_name, MAXPATHLEN);
+		strlcpy(zc28_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
+		strlcpy(zc28_c->zc_string, zc->zc_string, MAXPATHLEN);
+		strlcpy(zc28_c->zc_top_ds, zc->zc_top_ds, MAXPATHLEN);
+		zc28_c->zc_guid = zc->zc_guid;
+		zc28_c->zc_nvlist_conf = zc->zc_nvlist_conf;
+		zc28_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size;
+		zc28_c->zc_nvlist_src = zc->zc_nvlist_src;
+		zc28_c->zc_nvlist_src_size = zc->zc_nvlist_src_size;
+		zc28_c->zc_nvlist_dst = zc->zc_nvlist_dst;
+		zc28_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size;
+		zc28_c->zc_cookie = zc->zc_cookie;
+		zc28_c->zc_objset_type = zc->zc_objset_type;
+		zc28_c->zc_perm_action = zc->zc_perm_action;
+		zc28_c->zc_history = zc->zc_history;
+		zc28_c->zc_history_len = zc->zc_history_len;
+		zc28_c->zc_history_offset = zc->zc_history_offset;
+		zc28_c->zc_obj = zc->zc_obj;
+		zc28_c->zc_iflags = zc->zc_iflags;
+		zc28_c->zc_share = zc->zc_share;
+		zc28_c->zc_jailid = zc->zc_jailid;
+		zc28_c->zc_objset_stats = zc->zc_objset_stats;
+		zc28_c->zc_begin_record = zc->zc_begin_record;
+		zc28_c->zc_defer_destroy = zc->zc_defer_destroy;
+		zc28_c->zc_temphold = zc->zc_temphold;
+		zc28_c->zc_action_handle = zc->zc_action_handle;
+		zc28_c->zc_cleanup_fd = zc->zc_cleanup_fd;
+		zc28_c->zc_simple = zc->zc_simple;
+		bcopy(zc->zc_pad, zc28_c->zc_pad, sizeof(zc28_c->zc_pad));
+		zc28_c->zc_sendobj = zc->zc_sendobj;
+		zc28_c->zc_fromobj = zc->zc_fromobj;
+		zc28_c->zc_createtxg = zc->zc_createtxg;
+		zc28_c->zc_stat = zc->zc_stat;
+
+		/* zc_inject_record */
+		zc28_c->zc_inject_record.zi_objset =
+		    zc->zc_inject_record.zi_objset;
+		zc28_c->zc_inject_record.zi_object =
+		    zc->zc_inject_record.zi_object;
+		zc28_c->zc_inject_record.zi_start =
+		    zc->zc_inject_record.zi_start;
+		zc28_c->zc_inject_record.zi_end =
+		    zc->zc_inject_record.zi_end;
+		zc28_c->zc_inject_record.zi_guid =
+		    zc->zc_inject_record.zi_guid;
+		zc28_c->zc_inject_record.zi_level =
+		    zc->zc_inject_record.zi_level;
+		zc28_c->zc_inject_record.zi_error =
+		    zc->zc_inject_record.zi_error;
+		zc28_c->zc_inject_record.zi_type =
+		    zc->zc_inject_record.zi_type;
+		zc28_c->zc_inject_record.zi_freq =
+		    zc->zc_inject_record.zi_freq;
+		zc28_c->zc_inject_record.zi_failfast =
+		    zc->zc_inject_record.zi_failfast;
+		strlcpy(zc28_c->zc_inject_record.zi_func,
+		    zc->zc_inject_record.zi_func, MAXNAMELEN);
+		zc28_c->zc_inject_record.zi_iotype =
+		    zc->zc_inject_record.zi_iotype;
+		zc28_c->zc_inject_record.zi_duration =
+		    zc->zc_inject_record.zi_duration;
+		zc28_c->zc_inject_record.zi_timer =
+		    zc->zc_inject_record.zi_timer;
+		break;
+
 	case ZFS_CMD_COMPAT_V15:
 		zc_c = (void *)addr;
 
 		/* zc */
-		strlcpy(zc_c->zc_name,zc->zc_name,MAXPATHLEN);
-		strlcpy(zc_c->zc_value,zc->zc_value,MAXPATHLEN);
-		strlcpy(zc_c->zc_string,zc->zc_string,MAXPATHLEN);
+		strlcpy(zc_c->zc_name, zc->zc_name, MAXPATHLEN);
+		strlcpy(zc_c->zc_value, zc->zc_value, MAXPATHLEN);
+		strlcpy(zc_c->zc_string, zc->zc_string, MAXPATHLEN);
 		zc_c->zc_guid = zc->zc_guid;
 		zc_c->zc_nvlist_conf = zc->zc_nvlist_conf;
 		zc_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size;
@@ -260,7 +408,7 @@
 }
 
 static int
-zfs_ioctl_compat_fix_stats(zfs_cmd_t *zc, const int cflag)
+zfs_ioctl_compat_fix_stats(zfs_cmd_t *zc, const int nc)
 {
 	nvlist_t *nv, *nvp = NULL;
 	nvpair_t *elem;
@@ -270,7 +418,7 @@
 	    zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0)
 		return (error);
 
-	if (cflag == 5) { /* ZFS_IOC_POOL_STATS */
+	if (nc == 5) { /* ZFS_IOC_POOL_STATS */
 		elem = NULL;
 		while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
 			if (nvpair_value_nvlist(elem, &nvp) == 0)
@@ -334,17 +482,22 @@
 	void *zc_c;
 	unsigned long ncmd;
 
-	if (cflag == ZFS_CMD_COMPAT_NONE) {
+	switch (cflag) {
+	case ZFS_CMD_COMPAT_NONE:
 		ret = ioctl(fd, cmd, zc);
 		return (ret);
-	}
-
-	if (cflag == ZFS_CMD_COMPAT_V15) {
+	case ZFS_CMD_COMPAT_V28:
+		zc_c = malloc(sizeof(zfs_cmd_v28_t));
+		ncmd = _IOWR('Z', ZFS_IOC(cmd), struct zfs_cmd_v28);
+		break;
+	case ZFS_CMD_COMPAT_V15:
 		nc = zfs_ioctl_v28_to_v15[ZFS_IOC(cmd)];
 		zc_c = malloc(sizeof(zfs_cmd_v15_t));
 		ncmd = _IOWR('Z', nc, struct zfs_cmd_v15);
-	} else
+		break;
+	default:
 		return (EINVAL);
+	}
 
 	if (ZFS_IOC(ncmd) == ZFS_IOC_COMPAT_FAIL)
 		return (ENOTSUP);
@@ -358,16 +511,18 @@
 	zfs_cmd_compat_get(zc, (caddr_t)zc_c, cflag);
 	free(zc_c);
 
-	switch (nc) {
-	case 2:	/* ZFS_IOC_POOL_IMPORT */
-	case 4: /* ZFS_IOC_POOL_CONFIGS */
-	case 5: /* ZFS_IOC_POOL_STATS */
-	case 6: /* ZFS_IOC_POOL_TRYIMPORT */
-		zfs_ioctl_compat_fix_stats(zc, nc);
-		break;
-	case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */
-		zfs_ioctl_compat_pool_get_props(zc);
-		break;
+	if (cflag == ZFS_CMD_COMPAT_V15) {
+		switch (nc) {
+		case 2:	/* ZFS_IOC_POOL_IMPORT */
+		case 4: /* ZFS_IOC_POOL_CONFIGS */
+		case 5: /* ZFS_IOC_POOL_STATS */
+		case 6: /* ZFS_IOC_POOL_TRYIMPORT */
+			zfs_ioctl_compat_fix_stats(zc, nc);
+			break;
+		case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */
+			zfs_ioctl_compat_pool_get_props(zc);
+			break;
+		}
 	}
 
 	return (ret);

Modified: trunk/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h	2016-09-30 01:23:04 UTC (rev 8987)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Martin Matuska <mm at FreeBSD.org>.  All rights reserved.
+ * Copyright 2013 Martin Matuska <mm at FreeBSD.org>.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -40,11 +40,21 @@
 extern "C" {
 #endif
 
-#define ZFS_CMD_COMPAT_NONE	0
+/*
+ * Backwards ioctl compatibility
+ */
+
+/* ioctl versions for vfs.zfs.version.ioctl */
+#define	ZFS_IOCVER_DEADMAN	1
+#define	ZFS_IOCVER_CURRENT	ZFS_IOCVER_DEADMAN
+
+/* compatibility conversion flag */
+#define	ZFS_CMD_COMPAT_NONE	0
 #define	ZFS_CMD_COMPAT_V15	1
+#define	ZFS_CMD_COMPAT_V28	2
 
-#define ZFS_IOC_COMPAT_PASS	254
-#define ZFS_IOC_COMPAT_FAIL	255
+#define	ZFS_IOC_COMPAT_PASS	254
+#define	ZFS_IOC_COMPAT_FAIL	255
 
 typedef struct zinject_record_v15 {
 	uint64_t	zi_objset;
@@ -84,6 +94,60 @@
 	zinject_record_v15_t zc_inject_record;
 } zfs_cmd_v15_t;
 
+typedef struct zinject_record_v28 {
+	uint64_t	zi_objset;
+	uint64_t	zi_object;
+	uint64_t	zi_start;
+	uint64_t	zi_end;
+	uint64_t	zi_guid;
+	uint32_t	zi_level;
+	uint32_t	zi_error;
+	uint64_t	zi_type;
+	uint32_t	zi_freq;
+	uint32_t	zi_failfast;
+	char		zi_func[MAXNAMELEN];
+	uint32_t	zi_iotype;
+	int32_t		zi_duration;
+	uint64_t	zi_timer;
+} zinject_record_v28_t;
+
+typedef struct zfs_cmd_v28 {
+	char		zc_name[MAXPATHLEN];
+	char		zc_value[MAXPATHLEN * 2];
+	char		zc_string[MAXNAMELEN];
+	char		zc_top_ds[MAXPATHLEN];
+	uint64_t	zc_guid;
+	uint64_t	zc_nvlist_conf;		/* really (char *) */
+	uint64_t	zc_nvlist_conf_size;
+	uint64_t	zc_nvlist_src;		/* really (char *) */
+	uint64_t	zc_nvlist_src_size;
+	uint64_t	zc_nvlist_dst;		/* really (char *) */
+	uint64_t	zc_nvlist_dst_size;
+	uint64_t	zc_cookie;
+	uint64_t	zc_objset_type;
+	uint64_t	zc_perm_action;
+	uint64_t 	zc_history;		/* really (char *) */
+	uint64_t 	zc_history_len;
+	uint64_t	zc_history_offset;
+	uint64_t	zc_obj;
+	uint64_t	zc_iflags;		/* internal to zfs(7fs) */
+	zfs_share_t	zc_share;
+	uint64_t	zc_jailid;
+	dmu_objset_stats_t zc_objset_stats;
+	struct drr_begin zc_begin_record;
+	zinject_record_v28_t zc_inject_record;
+	boolean_t	zc_defer_destroy;
+	boolean_t	zc_temphold;
+	uint64_t	zc_action_handle;
+	int		zc_cleanup_fd;
+	uint8_t		zc_simple;
+	uint8_t		zc_pad[3];		/* alignment */
+	uint64_t	zc_sendobj;
+	uint64_t	zc_fromobj;
+	uint64_t	zc_createtxg;
+	zfs_stat_t	zc_stat;
+} zfs_cmd_v28_t;
+
 #ifdef _KERNEL
 unsigned static long zfs_ioctl_v15_to_v28[] = {
 	0,	/*  0 ZFS_IOC_POOL_CREATE */

Modified: trunk/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -350,6 +350,10 @@
 	    ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS");
 	zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY,
 	    ZFS_TYPE_DATASET, "<size>", "WRITTEN");
+	zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0,
+	    PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "LUSED");
+	zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced",
+	    0, PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "LREFER");
 
 	/* default number properties */
 	zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -2976,7 +2976,7 @@
     const zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr;
-	arc_buf_t *buf;
+	arc_buf_t *buf = NULL;
 	kmutex_t *hash_lock;
 	zio_t *rzio;
 	uint64_t guid = spa_load_guid(spa);
@@ -3058,7 +3058,7 @@
 		uint64_t size = BP_GET_LSIZE(bp);
 		arc_callback_t	*acb;
 		vdev_t *vd = NULL;
-		uint64_t addr;
+		uint64_t addr = 0;
 		boolean_t devw = B_FALSE;
 
 		if (hdr == NULL) {
@@ -3176,6 +3176,10 @@
 				cb->l2rcb_zb = *zb;
 				cb->l2rcb_flags = zio_flags;
 
+				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
+				    addr + size < vd->vdev_psize -
+				    VDEV_LABEL_END_SIZE);
+
 				/*
 				 * l2arc read.  The SCL_L2ARC lock will be
 				 * released by l2arc_read_done().
@@ -3371,8 +3375,8 @@
 	if (l2hdr) {
 		mutex_enter(&l2arc_buflist_mtx);
 		hdr->b_l2hdr = NULL;
-		buf_size = hdr->b_size;
 	}
+	buf_size = hdr->b_size;
 
 	/*
 	 * Do we have more than one buf?
@@ -4469,7 +4473,7 @@
 static list_t *
 l2arc_list_locked(int list_num, kmutex_t **lock)
 {
-	list_t *list;
+	list_t *list = NULL;
 	int idx;
 
 	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #include <sys/bpobj.h>
@@ -414,6 +414,12 @@
 
 			VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
 			    0, FTAG, &subdb, 0));
+			/*
+			 * Make sure that we are not asking dmu_write()
+			 * to write more data than we have in our buffer.
+			 */
+			VERIFY3U(subdb->db_size, >=,
+			    numsubsub * sizeof (subobj));
 			dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
 			    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
 			    numsubsub * sizeof (subobj), subdb->db_data, tx);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -408,8 +408,7 @@
 
 	if (dn->dn_objset->os_dsl_dataset)
 		dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
-	if (dp && dsl_pool_sync_context(dp))
-		start = gethrtime();
+	start = gethrtime();
 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, offset);
 	for (i = 0; i < nblks; i++) {
@@ -1712,7 +1711,7 @@
 	doi->doi_checksum = dn->dn_checksum;
 	doi->doi_compress = dn->dn_compress;
 	doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
-	doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz;
+	doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 	doi->doi_fill_count = 0;
 	for (int i = 0; i < dnp->dn_nblkptr; i++)
 		doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -1323,7 +1323,8 @@
 	objset_t *os = dn->dn_objset;
 	void *data = NULL;
 	dmu_buf_impl_t *db = NULL;
-	uint64_t *user, *group;
+	uint64_t *user = NULL;
+	uint64_t *group = NULL;
 	int flags = dn->dn_id_flags;
 	int error;
 	boolean_t have_spill = B_FALSE;

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -382,7 +382,7 @@
 
 	ds = dmu_buf_get_user(dbuf);
 	if (ds == NULL) {
-		dsl_dataset_t *winner;
+		dsl_dataset_t *winner = NULL;
 
 		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
 		ds->ds_dbuf = dbuf;
@@ -467,11 +467,8 @@
 			ds->ds_reserved = ds->ds_quota = 0;
 		}
 
-		if (err == 0) {
-			winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
-			    dsl_dataset_evict);
-		}
-		if (err || winner) {
+		if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds,
+		    &ds->ds_phys, dsl_dataset_evict)) != NULL) {
 			bplist_destroy(&ds->ds_pending_deadlist);
 			dsl_deadlist_close(&ds->ds_deadlist);
 			if (ds->ds_prev)
@@ -2347,6 +2344,8 @@
 	    (ds->ds_phys->ds_uncompressed_bytes * 100 /
 	    ds->ds_phys->ds_compressed_bytes);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
+	    ds->ds_phys->ds_uncompressed_bytes);
 
 	if (ds->ds_phys->ds_next_snap_obj) {
 		/*

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -541,6 +541,8 @@
 	    dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
 	    (dd->dd_phys->dd_uncompressed_bytes * 100 /
 	    dd->dd_phys->dd_compressed_bytes));
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
+	    dd->dd_phys->dd_uncompressed_bytes);
 	if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
 		    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -1658,7 +1658,8 @@
 		zio_priority = ZIO_PRIORITY_SCRUB;
 		needs_io = B_TRUE;
 		scan_delay = zfs_scrub_delay;
-	} else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
+	} else {
+		ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
 		zio_flags |= ZIO_FLAG_RESILVER;
 		zio_priority = ZIO_PRIORITY_RESILVER;
 		needs_io = B_FALSE;

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -38,6 +38,7 @@
 
 #include <sys/zfs_context.h>
 #include <sys/types.h>
+#include <sys/param.h>
 
 #define	MATCH_BITS	6
 #define	MATCH_MIN	3
@@ -51,7 +52,8 @@
 {
 	uchar_t *src = s_start;
 	uchar_t *dst = d_start;
-	uchar_t *cpy, *copymap;
+	uchar_t *cpy;
+	uchar_t *copymap = NULL;
 	int copymask = 1 << (NBBY - 1);
 	int mlen, offset, hash;
 	uint16_t *hp;
@@ -100,7 +102,8 @@
 	uchar_t *src = s_start;
 	uchar_t *dst = d_start;
 	uchar_t *d_end = (uchar_t *)d_start + d_len;
-	uchar_t *cpy, copymap;
+	uchar_t *cpy;
+	uchar_t copymap = 0;
 	int copymask = 1 << (NBBY - 1);
 
 	while (dst < d_end) {

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -48,6 +48,14 @@
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 
 /*
+ * The in-core space map representation is more compact than its on-disk form.
+ * The zfs_condense_pct determines how much more compact the in-core
+ * space_map representation must be before we compact it on-disk.
+ * Values should be greater than or equal to 100.
+ */
+int zfs_condense_pct = 200;
+
+/*
  * This value defines the number of allowed allocation failures per vdev.
  * If a device reaches this threshold in a given txg then we consider skipping
  * allocations on that device.
@@ -216,9 +224,9 @@
 	/*
 	 * If the weights are identical, use the offset to force uniqueness.
 	 */
-	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
+	if (m1->ms_map->sm_start < m2->ms_map->sm_start)
 		return (-1);
-	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
+	if (m1->ms_map->sm_start > m2->ms_map->sm_start)
 		return (1);
 
 	ASSERT3P(m1, ==, m2);
@@ -733,7 +741,8 @@
 	 * addition of new space; and for debugging, it ensures that we'd
 	 * data fault on any attempt to use this metaslab before it's ready.
 	 */
-	space_map_create(&msp->ms_map, start, size,
+	msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
+	space_map_create(msp->ms_map, start, size,
 	    vd->vdev_ashift, &msp->ms_lock);
 
 	metaslab_group_add(mg, msp);
@@ -740,7 +749,7 @@
 
 	if (metaslab_debug && smo->smo_object != 0) {
 		mutex_enter(&msp->ms_lock);
-		VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
+		VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops,
 		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
 		mutex_exit(&msp->ms_lock);
 	}
@@ -768,22 +777,27 @@
 	metaslab_group_t *mg = msp->ms_group;
 
 	vdev_space_update(mg->mg_vd,
-	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
+	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size);
 
 	metaslab_group_remove(mg, msp);
 
 	mutex_enter(&msp->ms_lock);
 
-	space_map_unload(&msp->ms_map);
-	space_map_destroy(&msp->ms_map);
+	space_map_unload(msp->ms_map);
+	space_map_destroy(msp->ms_map);
+	kmem_free(msp->ms_map, sizeof (*msp->ms_map));
 
 	for (int t = 0; t < TXG_SIZE; t++) {
-		space_map_destroy(&msp->ms_allocmap[t]);
-		space_map_destroy(&msp->ms_freemap[t]);
+		space_map_destroy(msp->ms_allocmap[t]);
+		space_map_destroy(msp->ms_freemap[t]);
+		kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t]));
+		kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t]));
 	}
 
-	for (int t = 0; t < TXG_DEFER_SIZE; t++)
-		space_map_destroy(&msp->ms_defermap[t]);
+	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+		space_map_destroy(msp->ms_defermap[t]);
+		kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t]));
+	}
 
 	ASSERT0(msp->ms_deferspace);
 
@@ -802,7 +816,7 @@
 metaslab_weight(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
-	space_map_t *sm = &msp->ms_map;
+	space_map_t *sm = msp->ms_map;
 	space_map_obj_t *smo = &msp->ms_smo;
 	vdev_t *vd = mg->mg_vd;
 	uint64_t weight, space;
@@ -810,6 +824,16 @@
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
+	 * This vdev is in the process of being removed so there is nothing
+	 * for us to do here.
+	 */
+	if (vd->vdev_removing) {
+		ASSERT0(smo->smo_alloc);
+		ASSERT0(vd->vdev_ms_shift);
+		return (0);
+	}
+
+	/*
 	 * The baseline weight is the metaslab's free space.
 	 */
 	space = sm->sm_size - smo->smo_alloc;
@@ -862,7 +886,7 @@
 	 * Prefetch the next potential metaslabs
 	 */
 	for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
-		space_map_t *sm = &msp->ms_map;
+		space_map_t *sm = msp->ms_map;
 		space_map_obj_t *smo = &msp->ms_smo;
 
 		/* If we have reached our prefetch limit then we're done */
@@ -883,7 +907,7 @@
 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 {
 	metaslab_group_t *mg = msp->ms_group;
-	space_map_t *sm = &msp->ms_map;
+	space_map_t *sm = msp->ms_map;
 	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -900,7 +924,7 @@
 				return (error);
 			}
 			for (int t = 0; t < TXG_DEFER_SIZE; t++)
-				space_map_walk(&msp->ms_defermap[t],
+				space_map_walk(msp->ms_defermap[t],
 				    space_map_claim, sm);
 
 		}
@@ -931,12 +955,158 @@
 	 * this metaslab again.  In that case, it had better be empty,
 	 * or we would be leaving space on the table.
 	 */
-	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
+	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map->sm_space == 0);
 	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
 }
 
 /*
+ * Determine if the in-core space map representation can be condensed on-disk.
+ * We would like to use the following criteria to make our decision:
+ *
+ * 1. The size of the space map object should not dramatically increase as a
+ * result of writing out our in-core free map.
+ *
+ * 2. The minimal on-disk space map representation is zfs_condense_pct/100
+ * times the size of the in-core representation (e.g. zfs_condense_pct = 110
+ * and in-core = 1MB, minimal = 1.1MB).
+ *
+ * Checking the first condition is tricky since we don't want to walk
+ * the entire AVL tree calculating the estimated on-disk size. Instead we
+ * use the size-ordered AVL tree in the space map and calculate the
+ * size required for the largest segment in our in-core free map. If the
+ * size required to represent that segment on disk is larger than the space
+ * map object then we avoid condensing this map.
+ *
+ * To determine the second criterion we use a best-case estimate and assume
+ * each segment can be represented on-disk as a single 64-bit entry. We refer
+ * to this best-case estimate as the space map's minimal form.
+ */
+static boolean_t
+metaslab_should_condense(metaslab_t *msp)
+{
+	space_map_t *sm = msp->ms_map;
+	space_map_obj_t *smo = &msp->ms_smo_syncing;
+	space_seg_t *ss;
+	uint64_t size, entries, segsz;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT(sm->sm_loaded);
+
+	/*
+	 * Use the sm_pp_root AVL tree, which is ordered by size, to obtain
+	 * the largest segment in the in-core free map. If the tree is
+	 * empty then we should condense the map.
+	 */
+	ss = avl_last(sm->sm_pp_root);
+	if (ss == NULL)
+		return (B_TRUE);
+
+	/*
+	 * Calculate the number of 64-bit entries this segment would need
+	 * on disk (truncating division; assumes size != 0 -- TODO confirm).
+	 * If this one segment would be larger on-disk than the entire current
+	 * on-disk structure, then condensing would grow the on-disk size.
+	 */
+	size = (ss->ss_end - ss->ss_start) >> sm->sm_shift;
+	entries = size / (MIN(size, SM_RUN_MAX));
+	segsz = entries * sizeof (uint64_t);
+
+	return (segsz <= smo->smo_objsize &&
+	    smo->smo_objsize >= (zfs_condense_pct *
+	    sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) / 100);
+}
+
+/*
+ * Condense the on-disk space map representation to its minimized form.
+ * The minimized form consists of a small number of allocations followed by
+ * the in-core free map.
+ */
+static void
+metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
+{
+	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+	space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK];
+	space_map_t condense_map;
+	space_map_t *sm = msp->ms_map;
+	objset_t *mos = spa_meta_objset(spa);
+	space_map_obj_t *smo = &msp->ms_smo_syncing;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT3U(spa_sync_pass(spa), ==, 1);
+	ASSERT(sm->sm_loaded);
+
+	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
+	    "smo size %llu, segments %lu", txg,
+	    (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
+	    smo->smo_objsize, avl_numnodes(&sm->sm_root));
+
+	/*
+	 * Create a map that is a 100% allocated map. We remove segments
+	 * that have been freed in this txg, any deferred frees that exist,
+	 * and any allocation in the future. Removing segments should be
+	 * a relatively inexpensive operation since we expect these maps to
+	 * have a small number of nodes.
+	 */
+	space_map_create(&condense_map, sm->sm_start, sm->sm_size,
+	    sm->sm_shift, sm->sm_lock);
+	space_map_add(&condense_map, condense_map.sm_start,
+	    condense_map.sm_size);
+
+	/*
+	 * Remove what's been freed in this txg from the condense_map.
+	 * Since we're in sync_pass 1, we know that all the frees from
+	 * this txg are in the freemap.
+	 */
+	space_map_walk(freemap, space_map_remove, &condense_map);
+
+	for (int t = 0; t < TXG_DEFER_SIZE; t++)
+		space_map_walk(msp->ms_defermap[t],
+		    space_map_remove, &condense_map);
+
+	for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
+		space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK],
+		    space_map_remove, &condense_map);
+
+	/*
+	 * We're about to drop the metaslab's lock thus allowing
+	 * other consumers to change its content. Set the
+	 * space_map's sm_condensing flag to ensure that
+	 * allocations on this metaslab do not occur while we're
+	 * in the middle of committing it to disk. This is only critical
+	 * for the ms_map as all other space_maps use per txg
+	 * views of their content.
+	 */
+	sm->sm_condensing = B_TRUE;
+
+	mutex_exit(&msp->ms_lock);
+	space_map_truncate(smo, mos, tx);
+	mutex_enter(&msp->ms_lock);
+
+	/*
+	 * While we would ideally like to create a space_map representation
+	 * that consists only of allocation records, doing so can be
+	 * prohibitively expensive because the in-core free map can be
+	 * large, and therefore computationally expensive to subtract
+	 * from the condense_map. Instead we sync out two maps, a cheap
+	 * allocation only map followed by the in-core free map. While not
+	 * optimal, this is typically close to optimal, and much cheaper to
+	 * compute.
+	 */
+	space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx);
+	space_map_vacate(&condense_map, NULL, NULL);
+	space_map_destroy(&condense_map);
+
+	space_map_sync(sm, SM_FREE, smo, mos, tx);
+	sm->sm_condensing = B_FALSE;
+
+	spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, "
+	    "smo size %llu", txg,
+	    (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
+	    smo->smo_objsize);
+}
+
+/*
  * Write a metaslab to disk in the context of the specified transaction group.
  */
 void
@@ -945,10 +1115,10 @@
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
-	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
-	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
-	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
-	space_map_t *sm = &msp->ms_map;
+	space_map_t *allocmap = msp->ms_allocmap[txg & TXG_MASK];
+	space_map_t **freemap = &msp->ms_freemap[txg & TXG_MASK];
+	space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+	space_map_t *sm = msp->ms_map;
 	space_map_obj_t *smo = &msp->ms_smo_syncing;
 	dmu_buf_t *db;
 	dmu_tx_t *tx;
@@ -955,9 +1125,21 @@
 
 	ASSERT(!vd->vdev_ishole);
 
-	if (allocmap->sm_space == 0 && freemap->sm_space == 0)
+	/*
+	 * This metaslab has just been added so there's no work to do now.
+	 */
+	if (*freemap == NULL) {
+		ASSERT3P(allocmap, ==, NULL);
 		return;
+	}
 
+	ASSERT3P(allocmap, !=, NULL);
+	ASSERT3P(*freemap, !=, NULL);
+	ASSERT3P(*freed_map, !=, NULL);
+
+	if (allocmap->sm_space == 0 && (*freemap)->sm_space == 0)
+		return;
+
 	/*
 	 * The only state that can actually be changing concurrently with
 	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
@@ -983,49 +1165,36 @@
 
 	mutex_enter(&msp->ms_lock);
 
-	space_map_walk(freemap, space_map_add, freed_map);
+	if (sm->sm_loaded && spa_sync_pass(spa) == 1 &&
+	    metaslab_should_condense(msp)) {
+		metaslab_condense(msp, txg, tx);
+	} else {
+		space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
+		space_map_sync(*freemap, SM_FREE, smo, mos, tx);
+	}
 
-	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
-	    2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
-		/*
-		 * The in-core space map representation is twice as compact
-		 * as the on-disk one, so it's time to condense the latter
-		 * by generating a pure allocmap from first principles.
-		 *
-		 * This metaslab is 100% allocated,
-		 * minus the content of the in-core map (sm),
-		 * minus what's been freed this txg (freed_map),
-		 * minus deferred frees (ms_defermap[]),
-		 * minus allocations from txgs in the future
-		 * (because they haven't been committed yet).
-		 */
-		space_map_vacate(allocmap, NULL, NULL);
-		space_map_vacate(freemap, NULL, NULL);
+	space_map_vacate(allocmap, NULL, NULL);
 
-		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);
-
-		space_map_walk(sm, space_map_remove, allocmap);
-		space_map_walk(freed_map, space_map_remove, allocmap);
-
-		for (int t = 0; t < TXG_DEFER_SIZE; t++)
-			space_map_walk(&msp->ms_defermap[t],
-			    space_map_remove, allocmap);
-
-		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
-			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
-			    space_map_remove, allocmap);
-
-		mutex_exit(&msp->ms_lock);
-		space_map_truncate(smo, mos, tx);
-		mutex_enter(&msp->ms_lock);
+	/*
+	 * For sync pass 1, we avoid walking the entire space map and
+	 * instead will just swap the pointers for freemap and
+	 * freed_map. We can safely do this since the freed_map is
+	 * guaranteed to be empty on the initial pass.
+	 */
+	if (spa_sync_pass(spa) == 1) {
+		ASSERT0((*freed_map)->sm_space);
+		ASSERT0(avl_numnodes(&(*freed_map)->sm_root));
+		space_map_swap(freemap, freed_map);
+	} else {
+		space_map_vacate(*freemap, space_map_add, *freed_map);
 	}
 
-	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
-	space_map_sync(freemap, SM_FREE, smo, mos, tx);
+	ASSERT0(msp->ms_allocmap[txg & TXG_MASK]->sm_space);
+	ASSERT0(msp->ms_freemap[txg & TXG_MASK]->sm_space);
 
 	mutex_exit(&msp->ms_lock);
 
-	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
+	VERIFY0(dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
 	dmu_buf_will_dirty(db, tx);
 	ASSERT3U(db->db_size, >=, sizeof (*smo));
 	bcopy(smo, db->db_data, sizeof (*smo));
@@ -1043,9 +1212,9 @@
 {
 	space_map_obj_t *smo = &msp->ms_smo;
 	space_map_obj_t *smosync = &msp->ms_smo_syncing;
-	space_map_t *sm = &msp->ms_map;
-	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
-	space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
+	space_map_t *sm = msp->ms_map;
+	space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+	space_map_t **defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	int64_t alloc_delta, defer_delta;
@@ -1056,41 +1225,58 @@
 
 	/*
 	 * If this metaslab is just becoming available, initialize its
-	 * allocmaps and freemaps and add its capacity to the vdev.
+	 * allocmaps, freemaps, and defermap and add its capacity to the vdev.
 	 */
-	if (freed_map->sm_size == 0) {
+	if (*freed_map == NULL) {
+		ASSERT(*defer_map == NULL);
 		for (int t = 0; t < TXG_SIZE; t++) {
-			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
+			msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t),
+			    KM_SLEEP);
+			space_map_create(msp->ms_allocmap[t], sm->sm_start,
 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
-			space_map_create(&msp->ms_freemap[t], sm->sm_start,
+			msp->ms_freemap[t] = kmem_zalloc(sizeof (space_map_t),
+			    KM_SLEEP);
+			space_map_create(msp->ms_freemap[t], sm->sm_start,
 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
 		}
 
-		for (int t = 0; t < TXG_DEFER_SIZE; t++)
-			space_map_create(&msp->ms_defermap[t], sm->sm_start,
+		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+			msp->ms_defermap[t] = kmem_zalloc(sizeof (space_map_t),
+			    KM_SLEEP);
+			space_map_create(msp->ms_defermap[t], sm->sm_start,
 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
+		}
 
+		freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+		defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
+
 		vdev_space_update(vd, 0, 0, sm->sm_size);
 	}
 
 	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
-	defer_delta = freed_map->sm_space - defer_map->sm_space;
+	defer_delta = (*freed_map)->sm_space - (*defer_map)->sm_space;
 
 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
 
-	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
-	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
+	ASSERT(msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0);
+	ASSERT(msp->ms_freemap[txg & TXG_MASK]->sm_space == 0);
 
 	/*
 	 * If there's a space_map_load() in progress, wait for it to complete
 	 * so that we have a consistent view of the in-core space map.
-	 * Then, add defer_map (oldest deferred frees) to this map and
-	 * transfer freed_map (this txg's frees) to defer_map.
 	 */
 	space_map_load_wait(sm);
-	space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
-	space_map_vacate(freed_map, space_map_add, defer_map);
 
+	/*
+	 * Move the frees from the defer_map to this map (if it's loaded).
+	 * Swap the freed_map and the defer_map -- this is safe to do
+	 * because we've just emptied out the defer_map.
+	 */
+	space_map_vacate(*defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
+	ASSERT0((*defer_map)->sm_space);
+	ASSERT0(avl_numnodes(&(*defer_map)->sm_root));
+	space_map_swap(freed_map, defer_map);
+
 	*smo = *smosync;
 
 	msp->ms_deferspace += defer_delta;
@@ -1113,7 +1299,7 @@
 		int evictable = 1;
 
 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
-			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
+			if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space)
 				evictable = 0;
 
 		if (evictable && !metaslab_debug)
@@ -1138,7 +1324,7 @@
 	for (int m = 0; m < vd->vdev_ms_count; m++) {
 		metaslab_t *msp = vd->vdev_ms[m];
 
-		if (msp->ms_map.sm_start > mg->mg_bonus_area)
+		if (msp->ms_map->sm_start > mg->mg_bonus_area)
 			break;
 
 		mutex_enter(&msp->ms_lock);
@@ -1159,7 +1345,7 @@
 {
 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
-	uint64_t start = msp->ms_map.sm_start >> ms_shift;
+	uint64_t start = msp->ms_map->sm_start >> ms_shift;
 
 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
 		return (1ULL << 63);
@@ -1207,6 +1393,13 @@
 				mutex_exit(&mg->mg_lock);
 				return (-1ULL);
 			}
+
+			/*
+			 * If the selected metaslab is condensing, skip it.
+			 */
+			if (msp->ms_map->sm_condensing)
+				continue;
+
 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
 				break;
@@ -1272,20 +1465,30 @@
 			continue;
 		}
 
-		if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
+		/*
+		 * If this metaslab is currently condensing then pick again as
+		 * we can't manipulate this metaslab until it's committed
+		 * to disk.
+		 */
+		if (msp->ms_map->sm_condensing) {
+			mutex_exit(&msp->ms_lock);
+			continue;
+		}
+
+		if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL)
 			break;
 
 		atomic_inc_64(&mg->mg_alloc_failures);
 
-		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
+		metaslab_passivate(msp, space_map_maxsize(msp->ms_map));
 
 		mutex_exit(&msp->ms_lock);
 	}
 
-	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+	if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
-	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
+	space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, asize);
 
 	mutex_exit(&msp->ms_lock);
 
@@ -1517,13 +1720,13 @@
 	mutex_enter(&msp->ms_lock);
 
 	if (now) {
-		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
+		space_map_remove(msp->ms_allocmap[txg & TXG_MASK],
 		    offset, size);
-		space_map_free(&msp->ms_map, offset, size);
+		space_map_free(msp->ms_map, offset, size);
 	} else {
-		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
+		if (msp->ms_freemap[txg & TXG_MASK]->sm_space == 0)
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
-		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
+		space_map_add(msp->ms_freemap[txg & TXG_MASK], offset, size);
 	}
 
 	mutex_exit(&msp->ms_lock);
@@ -1558,10 +1761,10 @@
 
 	mutex_enter(&msp->ms_lock);
 
-	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
+	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map->sm_loaded)
 		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
 
-	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
+	if (error == 0 && !space_map_contains(msp->ms_map, offset, size))
 		error = ENOENT;
 
 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
@@ -1569,12 +1772,12 @@
 		return (error);
 	}
 
-	space_map_claim(&msp->ms_map, offset, size);
+	space_map_claim(msp->ms_map, offset, size);
 
 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
-		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+		if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
-		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+		space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, size);
 	}
 
 	mutex_exit(&msp->ms_lock);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -110,7 +110,7 @@
 int64_t
 refcount_add_many(refcount_t *rc, uint64_t number, void *holder)
 {
-	reference_t *ref;
+	reference_t *ref = NULL;
 	int64_t count;
 
 	if (reference_tracking_enable) {

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -660,7 +660,8 @@
 	int buf_space;
 	sa_attr_type_t *attrs, *attrs_start;
 	int i, lot_count;
-	int hdrsize, spillhdrsize;
+	int hdrsize;
+	int spillhdrsize = 0;
 	int used;
 	dmu_object_type_t bonustype;
 	sa_lot_t *lot;
@@ -837,7 +838,7 @@
 {
 	sa_os_t *sa = os->os_sa;
 	uint64_t sa_attr_count = 0;
-	uint64_t sa_reg_count;
+	uint64_t sa_reg_count = 0;
 	int error = 0;
 	uint64_t attr_value;
 	sa_attr_table_t *tb;
@@ -1645,7 +1646,8 @@
 	sa_bulk_attr_t *attr_desc;
 	void *old_data[2];
 	int bonus_attr_count = 0;
-	int bonus_data_size, spill_data_size;
+	int bonus_data_size = 0;
+	int spill_data_size = 0;
 	int spill_attr_count = 0;
 	int error;
 	uint16_t length;

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
  */
 
 /*
@@ -141,6 +142,10 @@
 boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
 extern int	zfs_sync_pass_deferred_free;
 
+#ifndef illumos
+extern void spa_deadman(void *arg);
+#endif
+
 /*
  * This (illegal) pool name is used when temporarily importing a spa_t in order
  * to get the vdev stats associated with the imported devices.
@@ -383,7 +388,7 @@
 {
 	nvpair_t *elem;
 	int error = 0, reset_bootfs = 0;
-	uint64_t objnum;
+	uint64_t objnum = 0;
 	boolean_t has_feature = B_FALSE;
 
 	elem = NULL;
@@ -1389,6 +1394,7 @@
 		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
 	} else {
 		nl2cache = 0;
+		newvdevs = NULL;
 	}
 
 	oldvdevs = sav->sav_vdevs;
@@ -4702,7 +4708,7 @@
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *pvd, *cvd, *tvd;
 	boolean_t unspare = B_FALSE;
-	uint64_t unspare_guid;
+	uint64_t unspare_guid = 0;
 	char *vdpath;
 
 	ASSERT(spa_writeable(spa));
@@ -6257,6 +6263,17 @@
 
 	tx = dmu_tx_create_assigned(dp, txg);
 
+	spa->spa_sync_starttime = gethrtime();
+#ifdef illumos
+	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
+	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
+#else	/* FreeBSD */
+#ifdef _KERNEL
+	callout_reset(&spa->spa_deadman_cycid,
+	    hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa);
+#endif
+#endif
+
 	/*
 	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
 	 * set spa_deflate if we have no raid-z vdevs.
@@ -6385,6 +6402,14 @@
 	}
 	dmu_tx_commit(tx);
 
+#ifdef illumos
+	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
+#else	/* FreeBSD */
+#ifdef _KERNEL
+	callout_drain(&spa->spa_deadman_cycid);
+#endif
+#endif
+
 	/*
 	 * Clear the dirty config list.
 	 */

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -22,10 +22,12 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa_impl.h>
+#include <sys/spa_boot.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
@@ -253,8 +255,54 @@
 SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0,
     "Try to recover from otherwise-fatal errors.");
 
+extern int zfs_txg_synctime_ms;
 
 /*
+ * Expiration time in units of zfs_txg_synctime_ms. This value has two
+ * meanings. First it is used to determine when the spa_deadman logic
+ * should fire. By default the spa_deadman will fire if spa_sync has
+ * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
+ * Secondly, the value determines if an I/O is considered "hung".
+ * Any I/O that has not completed in zfs_deadman_synctime is considered
+ * "hung" resulting in a system panic.
+ * The default is 1000 zfs_txg_synctime_ms (i.e. 1000 seconds).
+ */
+uint64_t zfs_deadman_synctime = 1000ULL;
+TUNABLE_QUAD("vfs.zfs.deadman_synctime", &zfs_deadman_synctime);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime, CTLFLAG_RDTUN,
+    &zfs_deadman_synctime, 0,
+    "Stalled ZFS I/O expiration time in units of vfs.zfs.txg_synctime_ms");
+
+/*
+ * Default value of -1 for zfs_deadman_enabled is resolved in
+ * zfs_deadman_init()
+ */
+int zfs_deadman_enabled = -1;
+TUNABLE_INT("vfs.zfs.deadman_enabled", &zfs_deadman_enabled);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN,
+    &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");
+
+#ifndef illumos
+#ifdef _KERNEL
+static void
+zfs_deadman_init()
+{
+	/*
+	 * If we are not on i386 or amd64, or we are running in a virtual
+	 * machine, disable the ZFS deadman thread by default.
+	 */
+	if (zfs_deadman_enabled == -1) {
+#if defined(__amd64__) || defined(__i386__)
+		zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0;
+#else
+		zfs_deadman_enabled = 0;
+#endif
+	}
+}
+#endif	/* _KERNEL */
+#endif	/* !illumos */
+
+/*
  * ==========================================================================
  * SPA config locking
  * ==========================================================================
@@ -422,6 +470,23 @@
 }
 
 /*
+ * Fires when spa_sync has not completed within zfs_deadman_synctime (in
+ * units of zfs_txg_synctime_ms). If zfs_deadman_enabled is set, it then
+ * inspects all vdev queues looking for potentially hung I/Os.
+ */
+void
+spa_deadman(void *arg)
+{
+	spa_t *spa = arg;
+
+	zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
+	    (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
+	    ++spa->spa_deadman_calls);
+	if (zfs_deadman_enabled)
+		vdev_deadman(spa->spa_root_vdev);
+}
+
+/*
  * Create an uninitialized spa_t with the given name.  Requires
  * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
  * exist by calling spa_lookup() first.
@@ -431,6 +496,10 @@
 {
 	spa_t *spa;
 	spa_config_dirent_t *dp;
+#ifdef illumos
+	cyc_handler_t hdlr;
+	cyc_time_t when;
+#endif
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
@@ -462,6 +531,32 @@
 	spa->spa_proc = &p0;
 	spa->spa_proc_state = SPA_PROC_NONE;
 
+#ifdef illumos
+	hdlr.cyh_func = spa_deadman;
+	hdlr.cyh_arg = spa;
+	hdlr.cyh_level = CY_LOW_LEVEL;
+#endif
+
+	spa->spa_deadman_synctime = zfs_deadman_synctime *
+	    zfs_txg_synctime_ms * MICROSEC;
+
+#ifdef illumos
+	/*
+	 * This determines how often we need to check for hung I/Os after
+	 * the cyclic has already fired. Since checking for hung I/Os is
+	 * an expensive operation we don't want to check too frequently.
+	 * Instead wait for 5 synctimes before checking again.
+	 */
+	when.cyt_interval = 5ULL * zfs_txg_synctime_ms * MICROSEC;
+	when.cyt_when = CY_INFINITY;
+	mutex_enter(&cpu_lock);
+	spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
+	mutex_exit(&cpu_lock);
+#else	/* !illumos */
+#ifdef _KERNEL
+	callout_init(&spa->spa_deadman_cycid, CALLOUT_MPSAFE);
+#endif
+#endif
 	refcount_create(&spa->spa_refcount);
 	spa_config_lock_init(spa);
 
@@ -544,6 +639,18 @@
 	nvlist_free(spa->spa_load_info);
 	spa_config_set(spa, NULL);
 
+#ifdef illumos
+	mutex_enter(&cpu_lock);
+	if (spa->spa_deadman_cycid != CYCLIC_NONE)
+		cyclic_remove(spa->spa_deadman_cycid);
+	mutex_exit(&cpu_lock);
+	spa->spa_deadman_cycid = CYCLIC_NONE;
+#else	/* !illumos */
+#ifdef _KERNEL
+	callout_drain(&spa->spa_deadman_cycid);
+#endif
+#endif
+
 	refcount_destroy(&spa->spa_refcount);
 
 	spa_config_lock_destroy(spa);
@@ -1511,6 +1618,12 @@
 }
 
 uint64_t
+spa_deadman_synctime(spa_t *spa)
+{
+	return (spa->spa_deadman_synctime);
+}
+
+uint64_t
 dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
 {
 	uint64_t asize = DVA_GET_ASIZE(dva);
@@ -1605,7 +1718,9 @@
 	spa_mode_global = mode;
 
 #ifdef illumos
-#ifndef _KERNEL
+#ifdef _KERNEL
+	spa_arch_init();
+#else
 	if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
 		arc_procfd = open("/proc/self/ctl", O_WRONLY);
 		if (arc_procfd == -1) {
@@ -1629,6 +1744,11 @@
 	zpool_feature_init();
 	spa_config_load();
 	l2arc_start();
+#ifndef illumos
+#ifdef _KERNEL
+	zfs_deadman_init();
+#endif
+#endif	/* !illumos */
 }
 
 void

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -107,6 +107,7 @@
 	int merge_before, merge_after;
 
 	ASSERT(MUTEX_HELD(sm->sm_lock));
+	VERIFY(!sm->sm_condensing);
 	VERIFY(size != 0);
 	VERIFY3U(start, >=, sm->sm_start);
 	VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
@@ -174,6 +175,7 @@
 	int left_over, right_over;
 
 	ASSERT(MUTEX_HELD(sm->sm_lock));
+	VERIFY(!sm->sm_condensing);
 	VERIFY(size != 0);
 	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
 	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
@@ -243,6 +245,20 @@
 }
 
 void
+space_map_swap(space_map_t **msrc, space_map_t **mdst)
+{
+	space_map_t *sm;
+
+	ASSERT(MUTEX_HELD((*msrc)->sm_lock));
+	ASSERT0((*mdst)->sm_space);
+	ASSERT0(avl_numnodes(&(*mdst)->sm_root));
+
+	sm = *msrc;
+	*msrc = *mdst;
+	*mdst = sm;
+}
+
+void
 space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
 {
 	space_seg_t *ss;
@@ -423,9 +439,9 @@
 	space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_objset_spa(os);
-	void *cookie = NULL;
+	avl_tree_t *t = &sm->sm_root;
 	space_seg_t *ss;
-	uint64_t bufsize, start, size, run_len, delta, sm_space;
+	uint64_t bufsize, start, size, run_len, total, sm_space, nodes;
 	uint64_t *entry, *entry_map, *entry_map_end;
 
 	ASSERT(MUTEX_HELD(sm->sm_lock));
@@ -454,13 +470,14 @@
 	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
 	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
 
-	delta = 0;
+	total = 0;
+	nodes = avl_numnodes(&sm->sm_root);
 	sm_space = sm->sm_space;
-	while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
+	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
 		size = ss->ss_end - ss->ss_start;
 		start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
 
-		delta += size;
+		total += size;
 		size >>= sm->sm_shift;
 
 		while (size) {
@@ -482,7 +499,6 @@
 			start += run_len;
 			size -= run_len;
 		}
-		kmem_cache_free(space_seg_cache, ss);
 	}
 
 	if (entry != entry_map) {
@@ -498,12 +514,11 @@
 	 * Ensure that the space_map's accounting wasn't changed
 	 * while we were in the middle of writing it out.
 	 */
+	VERIFY3U(nodes, ==, avl_numnodes(&sm->sm_root));
 	VERIFY3U(sm->sm_space, ==, sm_space);
+	VERIFY3U(sm->sm_space, ==, total);
 
 	zio_buf_free(entry_map, bufsize);
-
-	sm->sm_space -= delta;
-	VERIFY0(sm->sm_space);
 }
 
 void

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h	2016-09-30 01:23:04 UTC (rev 8987)
@@ -66,20 +66,38 @@
 };
 
 /*
- * Each metaslab's free space is tracked in space map object in the MOS,
- * which is only updated in syncing context.  Each time we sync a txg,
+ * Each metaslab maintains an in-core free map (ms_map) that contains the
+ * current list of free segments. As blocks are allocated, the allocated
+ * segment is removed from the ms_map and added to a per txg allocation map.
+ * As blocks are freed, they are added to the per txg free map. These per
+ * txg maps allow us to process all allocations and frees in syncing context
+ * where it is safe to update the on-disk space maps.
+ *
+ * Each metaslab's free space is tracked in a space map object in the MOS,
+ * which is only updated in syncing context. Each time we sync a txg,
  * we append the allocs and frees from that txg to the space map object.
  * When the txg is done syncing, metaslab_sync_done() updates ms_smo
- * to ms_smo_syncing.  Everything in ms_smo is always safe to allocate.
+ * to ms_smo_syncing. Everything in ms_smo is always safe to allocate.
+ *
+ * To load the in-core free map we read the space map object from disk.
+ * This object contains a series of alloc and free records that are
+ * combined to make up the list of all free segments in this metaslab. These
+ * segments are represented in-core by the ms_map and are stored in an
+ * AVL tree.
+ *
+ * As the space map object grows (as a result of the appends) it will
+ * eventually become space-inefficient. When the space map object is
+ * zfs_condense_pct/100 times the size of the minimal on-disk representation,
+ * we rewrite it in its minimized form.
  */
 struct metaslab {
 	kmutex_t	ms_lock;	/* metaslab lock		*/
 	space_map_obj_t	ms_smo;		/* synced space map object	*/
 	space_map_obj_t	ms_smo_syncing;	/* syncing space map object	*/
-	space_map_t	ms_allocmap[TXG_SIZE];  /* allocated this txg	*/
-	space_map_t	ms_freemap[TXG_SIZE];	/* freed this txg	*/
-	space_map_t	ms_defermap[TXG_DEFER_SIZE]; /* deferred frees	*/
-	space_map_t	ms_map;		/* in-core free space map	*/
+	space_map_t	*ms_allocmap[TXG_SIZE];	/* allocated this txg	*/
+	space_map_t	*ms_freemap[TXG_SIZE];	/* freed this txg	*/
+	space_map_t	*ms_defermap[TXG_DEFER_SIZE];	/* deferred frees */
+	space_map_t	*ms_map;	/* in-core free space map	*/
 	int64_t		ms_deferspace;	/* sum of ms_defermap[] space	*/
 	uint64_t	ms_weight;	/* weight vs. others in group	*/
 	metaslab_group_t *ms_group;	/* metaslab group		*/

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h	2016-09-30 01:23:04 UTC (rev 8987)
@@ -599,6 +599,7 @@
 extern uint64_t spa_bootfs(spa_t *spa);
 extern uint64_t spa_delegation(spa_t *spa);
 extern objset_t *spa_meta_objset(spa_t *spa);
+extern uint64_t spa_deadman_synctime(spa_t *spa);
 
 /* Miscellaneous support routines */
 extern void spa_activate_mos_feature(spa_t *spa, const char *feature);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h	2016-09-30 01:23:04 UTC (rev 8987)
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #ifndef _SYS_SPA_BOOT_H
 #define	_SYS_SPA_BOOT_H
 
@@ -35,6 +39,8 @@
 extern char *spa_get_bootprop(char *prop);
 extern void spa_free_bootprop(char *prop);
 
+extern void spa_arch_init(void);
+
 #ifdef	__cplusplus
 }
 #endif

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h	2016-09-30 01:23:04 UTC (rev 8987)
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
  */
 
 #ifndef _SYS_SPA_IMPL_H
@@ -230,6 +231,16 @@
 	uint64_t	spa_feat_for_write_obj;	/* required to write to pool */
 	uint64_t	spa_feat_for_read_obj;	/* required to read from pool */
 	uint64_t	spa_feat_desc_obj;	/* Feature descriptions */
+#ifdef illumos
+	cyclic_id_t	spa_deadman_cycid;	/* cyclic id */
+#else	/* FreeBSD */
+#ifdef _KERNEL
+	struct callout	spa_deadman_cycid;	/* callout id */
+#endif
+#endif	/* illumos */
+	uint64_t	spa_deadman_calls;	/* number of deadman calls */
+	uint64_t	spa_sync_starttime;	/* starting time of spa_sync */
+	uint64_t	spa_deadman_synctime;	/* deadman expiration timer */
 	/*
 	 * spa_refcnt & spa_config_lock must be the last elements
 	 * because refcount_t changes size based on compilation options.

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h	2016-09-30 01:23:04 UTC (rev 8987)
@@ -40,17 +40,17 @@
 typedef struct space_map_ops space_map_ops_t;
 
 typedef struct space_map {
-	avl_tree_t	sm_root;	/* AVL tree of map segments */
+	avl_tree_t	sm_root;	/* offset-ordered segment AVL tree */
 	uint64_t	sm_space;	/* sum of all segments in the map */
 	uint64_t	sm_start;	/* start of map */
 	uint64_t	sm_size;	/* size of map */
 	uint8_t		sm_shift;	/* unit shift */
-	uint8_t		sm_pad[3];	/* unused */
 	uint8_t		sm_loaded;	/* map loaded? */
 	uint8_t		sm_loading;	/* map loading? */
+	uint8_t		sm_condensing;	/* map condensing? */
 	kcondvar_t	sm_load_cv;	/* map load completion */
 	space_map_ops_t	*sm_ops;	/* space map block picker ops vector */
-	avl_tree_t	*sm_pp_root;	/* picker-private AVL tree */
+	avl_tree_t	*sm_pp_root;	/* size-ordered, picker-private tree */
 	void		*sm_ppd;	/* picker-private data */
 	kmutex_t	*sm_lock;	/* pointer to lock that protects map */
 } space_map_t;
@@ -149,6 +149,7 @@
 extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
 extern boolean_t space_map_contains(space_map_t *sm,
     uint64_t start, uint64_t size);
+extern void space_map_swap(space_map_t **msrc, space_map_t **mdest);
 extern void space_map_vacate(space_map_t *sm,
     space_map_func_t *func, space_map_t *mdest);
 extern void space_map_walk(space_map_t *sm,

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h	2016-09-30 01:23:04 UTC (rev 8987)
@@ -80,6 +80,7 @@
 extern void vdev_metaslab_set_size(vdev_t *);
 extern void vdev_expand(vdev_t *vd, uint64_t txg);
 extern void vdev_split(vdev_t *vd);
+extern void vdev_deadman(vdev_t *vd);
 
 
 extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h	2016-09-30 01:23:04 UTC (rev 8987)
@@ -104,6 +104,8 @@
 	avl_tree_t	vq_read_tree;
 	avl_tree_t	vq_write_tree;
 	avl_tree_t	vq_pending_tree;
+	uint64_t	vq_io_complete_ts;
+	uint64_t	vq_io_delta_ts;
 	kmutex_t	vq_lock;
 };
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h	2016-09-30 01:23:04 UTC (rev 8987)
@@ -22,6 +22,10 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
 
 #ifndef _SYS_ZFS_CONTEXT_H
 #define	_SYS_ZFS_CONTEXT_H
@@ -88,6 +92,11 @@
 #include <sys/u8_textprep.h>
 #include <sys/fm/util.h>
 #include <sys/sunddi.h>
+#ifdef illumos
+#include <sys/cyclic.h>
+#else	/* FreeBSD */
+#include <sys/callout.h>
+#endif
 
 #include <machine/stdarg.h>
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h	2016-09-30 01:23:04 UTC (rev 8987)
@@ -246,6 +246,8 @@
 	uint32_t	zi_iotype;
 	int32_t		zi_duration;
 	uint64_t	zi_timer;
+	uint32_t	zi_cmd;
+	uint32_t	zi_pad;
 } zinject_record_t;
 
 #define	ZINJECT_NULL		0x1
@@ -252,6 +254,16 @@
 #define	ZINJECT_FLUSH_ARC	0x2
 #define	ZINJECT_UNLOAD_SPA	0x4
 
+typedef enum zinject_type {
+	ZINJECT_UNINITIALIZED,
+	ZINJECT_DATA_FAULT,
+	ZINJECT_DEVICE_FAULT,
+	ZINJECT_LABEL_FAULT,
+	ZINJECT_IGNORED_WRITES,
+	ZINJECT_PANIC,
+	ZINJECT_DELAY_IO,
+} zinject_type_t;
+
 typedef struct zfs_share {
 	uint64_t	z_exportdata;
 	uint64_t	z_sharedata;

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	2016-09-30 01:23:04 UTC (rev 8987)
@@ -443,6 +443,7 @@
 
 	uint64_t	io_offset;
 	uint64_t	io_deadline;
+	uint64_t	io_timestamp;
 	avl_node_t	io_offset_node;
 	avl_node_t	io_deadline_node;
 	avl_tree_t	*io_vdev_tree;
@@ -596,6 +597,7 @@
 extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
 extern int zio_handle_label_injection(zio_t *zio, int error);
 extern void zio_handle_ignored_writes(zio_t *zio);
+extern uint64_t zio_handle_io_delay(zio_t *zio);
 
 /*
  * Checksum ereport functions

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -23,6 +23,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -1846,6 +1847,7 @@
 
 	space_map_truncate(smo, mos, tx);
 	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
+	space_map_vacate(&smsync, NULL, NULL);
 
 	space_map_destroy(&smsync);
 
@@ -3173,3 +3175,44 @@
 	}
 	vdev_propagate_state(cvd);
 }
+
+void
+vdev_deadman(vdev_t *vd)
+{
+	for (int c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		vdev_deadman(cvd);
+	}
+
+	if (vd->vdev_ops->vdev_op_leaf) {
+		vdev_queue_t *vq = &vd->vdev_queue;
+
+		mutex_enter(&vq->vq_lock);
+		if (avl_numnodes(&vq->vq_pending_tree) > 0) {
+			spa_t *spa = vd->vdev_spa;
+			zio_t *fio;
+			uint64_t delta;
+
+			/*
+			 * Look at the head of all the pending queues;
+			 * if any I/O has been outstanding for longer than
+			 * the spa_deadman_synctime, we panic the system.
+			 */
+			fio = avl_first(&vq->vq_pending_tree);
+			delta = ddi_get_lbolt64() - fio->io_timestamp;
+			if (delta > NSEC_TO_TICK(spa_deadman_synctime(spa))) {
+				zfs_dbgmsg("SLOW IO: zio timestamp %llu, "
+				    "delta %llu, last io %llu",
+				    fio->io_timestamp, delta,
+				    vq->vq_io_complete_ts);
+				fm_panic("I/O to pool '%s' appears to be "
+				    "hung on vdev guid %llu at '%s'.",
+				    spa_name(spa),
+				    (long long unsigned int) vd->vdev_guid,
+				    vd->vdev_path);
+			}
+		}
+		mutex_exit(&vq->vq_lock);
+	}
+}

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #include <sys/zfs_context.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
@@ -315,6 +319,7 @@
 		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
 		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
 		    vdev_queue_agg_io_done, NULL);
+		aio->io_timestamp = fio->io_timestamp;
 
 		nio = fio;
 		do {
@@ -386,7 +391,8 @@
 
 	mutex_enter(&vq->vq_lock);
 
-	zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) +
+	zio->io_timestamp = ddi_get_lbolt64();
+	zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
 	    zio->io_priority;
 
 	vdev_queue_io_add(vq, zio);
@@ -411,10 +417,16 @@
 {
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 
+	if (zio_injection_enabled)
+		delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
+
 	mutex_enter(&vq->vq_lock);
 
 	avl_remove(&vq->vq_pending_tree, zio);
 
+	vq->vq_io_complete_ts = ddi_get_lbolt64();
+	vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
+
 	for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
 		zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
 		if (nio == NULL)

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -1198,7 +1198,8 @@
 	uint64_t ccount;
 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
-	uint8_t log, val;
+	uint8_t log = 0;
+	uint8_t val;
 	int ll;
 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *p, *pp;

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -220,7 +220,7 @@
 	uint16_t chunk_head;
 	uint16_t *chunkp = &chunk_head;
 	int byten = 0;
-	uint64_t value;
+	uint64_t value = 0;
 	int shift = (integer_size-1)*8;
 	int len = num_integers;
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -51,7 +51,7 @@
 {
 	caddr_t end;
 	caddr_t ptr;
-	zfs_ace_t *zacep;
+	zfs_ace_t *zacep = NULL;
 	ace_t *acep;
 	uint16_t entry_type;
 	size_t entry_size;

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -560,9 +560,9 @@
 	uint32_t fuid_idx = FUID_INDEX(id);
 	uint32_t rid;
 	idmap_stat status;
-	uint64_t idx;
+	uint64_t idx = 0;
 	zfs_fuid_t *zfuid = NULL;
-	zfs_fuid_info_t *fuidp;
+	zfs_fuid_info_t *fuidp = NULL;
 
 	/*
 	 * If POSIX ID, or entry is already a FUID then
@@ -587,6 +587,9 @@
 		if (fuidp == NULL)
 			return (UID_NOBODY);
 
+		VERIFY3U(type, >=, ZFS_OWNER);
+		VERIFY3U(type, <=, ZFS_ACE_GROUP);
+
 		switch (type) {
 		case ZFS_ACE_USER:
 		case ZFS_ACE_GROUP:
@@ -603,7 +606,7 @@
 			idx = FUID_INDEX(fuidp->z_fuid_group);
 			break;
 		};
-		domain = fuidp->z_domain_table[idx -1];
+		domain = fuidp->z_domain_table[idx - 1];
 	} else {
 		if (type == ZFS_OWNER || type == ZFS_ACE_USER)
 			status = kidmap_getsidbyuid(crgetzone(cr), id,

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -23,7 +23,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011-2012 Pawel Jakub Dawidek <pawel at dawidek.net>.
  * All rights reserved.
- * Portions Copyright 2011 Martin Matuska <mm at FreeBSD.org>
+ * Copyright 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
@@ -5312,12 +5312,14 @@
 	len = IOCPARM_LEN(cmd);
 
 	/*
-	 * Check if we have sufficient kernel memory allocated
-	 * for the zfs_cmd_t request.  Bail out if not so we
-	 * will not access undefined memory region.
+	 * Check if we are talking to supported older binaries
+	 * and translate zfs_cmd if necessary
 	 */
 	if (len < sizeof(zfs_cmd_t))
-		if (len == sizeof(zfs_cmd_v15_t)) {
+		if (len == sizeof(zfs_cmd_v28_t)) {
+			cflag = ZFS_CMD_COMPAT_V28;
+			vec = ZFS_IOC(cmd);
+		} else if (len == sizeof(zfs_cmd_v15_t)) {
 			cflag = ZFS_CMD_COMPAT_V15;
 			vec = zfs_ioctl_v15_to_v28[ZFS_IOC(cmd)];
 		} else
@@ -5332,6 +5334,11 @@
 			return (ENOTSUP);
 	}
 
+	/*
+	 * Check if we have sufficient kernel memory allocated
+	 * for the zfs_cmd_t request.  Bail out if not so we
+	 * will not access undefined memory region.
+	 */
 	if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
 		return (EINVAL);
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -243,7 +243,7 @@
 	itx_t *itx;
 	lr_create_t *lr;
 	lr_acl_create_t *lracl;
-	size_t aclsize;
+	size_t aclsize = (vsecp != NULL) ? vsecp->vsa_aclentsz : 0;
 	size_t xvatsize = 0;
 	size_t txsize;
 	xvattr_t *xvap = (xvattr_t *)vap;
@@ -273,7 +273,6 @@
 		txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
 		lrsize = sizeof (*lr);
 	} else {
-		aclsize = (vsecp) ? vsecp->vsa_aclentsz : 0;
 		txsize =
 		    sizeof (lr_acl_create_t) + namesize + fuidsz +
 		    ZIL_ACE_LENGTH(aclsize) + xvatsize;

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -463,7 +463,7 @@
 zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
 {
 	avl_tree_t *tree = &zp->z_range_avl;
-	rl_t *rl, *next;
+	rl_t *rl, *next = NULL;
 	uint64_t len;
 
 	/*

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -389,11 +389,18 @@
 	objset_t *os = NULL;
 	zfsvfs_t *zfsvfs = NULL;
 	uint64_t nbmand;
-	int readonly, do_readonly = B_FALSE;
-	int setuid, do_setuid = B_FALSE;
-	int exec, do_exec = B_FALSE;
-	int xattr, do_xattr = B_FALSE;
-	int atime, do_atime = B_FALSE;
+	boolean_t readonly = B_FALSE;
+	boolean_t do_readonly = B_FALSE;
+	boolean_t setuid = B_FALSE;
+	boolean_t do_setuid = B_FALSE;
+	boolean_t exec = B_FALSE;
+	boolean_t do_exec = B_FALSE;
+	boolean_t devices = B_FALSE;
+	boolean_t do_devices = B_FALSE;
+	boolean_t xattr = B_FALSE;
+	boolean_t do_xattr = B_FALSE;
+	boolean_t atime = B_FALSE;
+	boolean_t do_atime = B_FALSE;
 	int error = 0;
 
 	ASSERT(vfsp);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -576,7 +576,7 @@
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	objset_t	*os;
 	ssize_t		n, nbytes;
-	int		error;
+	int		error = 0;
 	rl_t		*rl;
 	xuio_t		*xuio = NULL;
 
@@ -736,9 +736,9 @@
 	ssize_t		n, nbytes;
 	rl_t		*rl;
 	int		max_blksz = zfsvfs->z_max_blksz;
-	int		error;
+	int		error = 0;
 	arc_buf_t	*abuf;
-	iovec_t		*aiov;
+	iovec_t		*aiov = NULL;
 	xuio_t		*xuio = NULL;
 	int		i_iov = 0;
 	int		iovcnt = uio->uio_iovcnt;
@@ -2411,6 +2411,7 @@
 		odp = (struct dirent64 *)outbuf;
 	} else {
 		bufsize = bytes_wanted;
+		outbuf = NULL;
 		odp = (struct dirent64 *)iovp->iov_base;
 	}
 	eodp = (struct edirent *)odp;
@@ -2894,7 +2895,7 @@
 	vattr_t		oldva;
 	xvattr_t	tmpxvattr;
 	uint_t		mask = vap->va_mask;
-	uint_t		saved_mask;
+	uint_t		saved_mask = 0;
 	uint64_t	saved_mode;
 	int		trim_mask = 0;
 	uint64_t	new_mode;

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c	2016-09-30 01:23:04 UTC (rev 8987)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -147,16 +148,10 @@
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
-		/* Ignore errors not destined for this pool */
-		if (zio->io_spa != handler->zi_spa)
+		if (zio->io_spa != handler->zi_spa ||
+		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
 			continue;
 
-		/* Ignore device errors and panic injection */
-		if (handler->zi_record.zi_guid != 0 ||
-		    handler->zi_record.zi_func[0] != '\0' ||
-		    handler->zi_record.zi_duration != 0)
-			continue;
-
 		/* If this handler matches, return EIO */
 		if (zio_match_handler(&zio->io_logical->io_bookmark,
 		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
@@ -197,10 +192,7 @@
 		uint64_t start = handler->zi_record.zi_start;
 		uint64_t end = handler->zi_record.zi_end;
 
-		/* Ignore device only faults or panic injection */
-		if (handler->zi_record.zi_start == 0 ||
-		    handler->zi_record.zi_func[0] != '\0' ||
-		    handler->zi_record.zi_duration != 0)
+		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
 			continue;
 
 		/*
@@ -246,13 +238,7 @@
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
-		/*
-		 * Ignore label specific faults, panic injection
-		 * or fake writes
-		 */
-		if (handler->zi_record.zi_start != 0 ||
-		    handler->zi_record.zi_func[0] != '\0' ||
-		    handler->zi_record.zi_duration != 0)
+		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
 			continue;
 
 		if (vd->vdev_guid == handler->zi_record.zi_guid) {
@@ -316,12 +302,10 @@
 	    handler = list_next(&inject_handlers, handler)) {
 
 		/* Ignore errors not destined for this pool */
-		if (zio->io_spa != handler->zi_spa)
+		if (zio->io_spa != handler->zi_spa ||
+		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
 			continue;
 
-		if (handler->zi_record.zi_duration == 0)
-			continue;
-
 		/*
 		 * Positive duration implies # of seconds, negative
 		 * a number of txgs
@@ -355,13 +339,10 @@
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
-		/* Ignore errors not destined for this pool */
-		if (spa != handler->zi_spa)
+		if (spa != handler->zi_spa ||
+		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
 			continue;
 
-		if (handler->zi_record.zi_duration == 0)
-			continue;
-
 		if (handler->zi_record.zi_duration > 0) {
 			VERIFY(handler->zi_record.zi_timer == 0 ||
 			    handler->zi_record.zi_timer +
@@ -379,6 +360,34 @@
 	rw_exit(&inject_lock);
 }
 
+uint64_t
+zio_handle_io_delay(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	inject_handler_t *handler;
+	uint64_t seconds = 0;
+
+	if (zio_injection_enabled == 0)
+		return (0);
+
+	rw_enter(&inject_lock, RW_READER);
+
+	for (handler = list_head(&inject_handlers); handler != NULL;
+	    handler = list_next(&inject_handlers, handler)) {
+
+		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
+			continue;
+
+		if (vd->vdev_guid == handler->zi_record.zi_guid) {
+			seconds = handler->zi_record.zi_timer;
+			break;
+		}
+
+	}
+	rw_exit(&inject_lock);
+	return (seconds);
+}
+
 /*
  * Create a new handler for the given record.  We add it to the list, adding
  * a reference to the spa_t in the process.  We increment zio_injection_enabled,

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h	2016-09-30 01:16:07 UTC (rev 8986)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h	2016-09-30 01:23:04 UTC (rev 8987)
@@ -131,6 +131,8 @@
 	ZFS_PROP_REFRATIO,
 	ZFS_PROP_WRITTEN,
 	ZFS_PROP_CLONES,
+	ZFS_PROP_LOGICALUSED,
+	ZFS_PROP_LOGICALREFERENCED,
 	ZFS_NUM_PROPS
 } zfs_prop_t;
 



More information about the Midnightbsd-cvs mailing list