[Midnightbsd-cvs] src [10164] trunk/sys/cddl/contrib/opensolaris/uts: sync with freebsd

laffer1 at midnightbsd.org
Fri Jun 1 18:46:42 EDT 2018


Revision: 10164
          http://svnweb.midnightbsd.org/src/?rev=10164
Author:   laffer1
Date:     2018-06-01 18:46:41 -0400 (Fri, 01 Jun 2018)
Log Message:
-----------
sync with freebsd

Modified Paths:
--------------
    trunk/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
    trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_mod.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_subr.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/lockstat.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/profile.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/sdt_subr.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/systrace.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/os/callb.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/os/fm.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/os/list.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/asm_linkage.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cmn_err.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/compress.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf_api.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/errorq.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/feature_tests.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/note.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/procset.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/synch.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/sys/zmod.h
    trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/opensolaris_crc32.c
    trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c
    trunk/sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c
    trunk/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c

Added Paths:
-----------
    trunk/sys/cddl/contrib/opensolaris/uts/mips/
    trunk/sys/cddl/contrib/opensolaris/uts/mips/dtrace/
    trunk/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c
    trunk/sys/cddl/contrib/opensolaris/uts/mips/sys/
    trunk/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h
    trunk/sys/cddl/contrib/opensolaris/uts/powerpc/
    trunk/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/
    trunk/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c
    trunk/sys/cddl/contrib/opensolaris/uts/powerpc/sys/
    trunk/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h

Property Changed:
----------------
    trunk/sys/cddl/contrib/opensolaris/uts/common/Makefile.files

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/Makefile.files	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/Makefile.files	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+# $MidnightBSD$
 #
 # CDDL HEADER START
 #
@@ -21,7 +22,9 @@
 
 #
 # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
+# Copyright (c) 2012 Joyent, Inc.  All rights reserved.
+# Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 # Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 #
 #
@@ -32,8 +36,10 @@
 ZFS_COMMON_OBJS +=		\
 	arc.o			\
 	bplist.o		\
+	blkptr.o		\
 	bpobj.o			\
 	bptree.o		\
+	bqueue.o		\
 	dbuf.o			\
 	ddt.o			\
 	ddt_zap.o		\
@@ -46,6 +52,7 @@
 	dmu_tx.o		\
 	dnode.o			\
 	dnode_sync.o		\
+	dsl_bookmark.o		\
 	dsl_dir.o		\
 	dsl_dataset.o		\
 	dsl_deadlist.o		\
@@ -62,6 +69,8 @@
 	lz4.o			\
 	lzjb.o			\
 	metaslab.o		\
+	multilist.o		\
+	range_tree.o		\
 	refcount.o		\
 	rrwlock.o		\
 	sa.o			\
@@ -72,6 +81,7 @@
 	spa_history.o		\
 	spa_misc.o		\
 	space_map.o		\
+	space_reftree.o		\
 	txg.o			\
 	uberblock.o		\
 	unique.o		\
@@ -126,6 +136,3 @@
 	zfs_vfsops.o		\
 	zfs_vnops.o		\
 	zvol.o
-
-ZUT_OBJS +=			\
-	zut.o


Property changes on: trunk/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_mod.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_mod.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_mod.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_subr.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_subr.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_subr.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -18,16 +19,15 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: release/9.2.0/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c 250484 2013-05-10 21:12:55Z pfg $
+ * $FreeBSD: stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c 314667 2017-03-04 13:03:31Z avg $
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * DTrace - Dynamic Tracing for Solaris
  *
@@ -68,7 +68,7 @@
  * on capital-f functions.
  */
 #include <sys/errno.h>
-#if !defined(sun)
+#ifndef illumos
 #include <sys/time.h>
 #endif
 #include <sys/stat.h>
@@ -75,13 +75,13 @@
 #include <sys/modctl.h>
 #include <sys/conf.h>
 #include <sys/systm.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/ddi.h>
 #include <sys/sunddi.h>
 #endif
 #include <sys/cpuvar.h>
 #include <sys/kmem.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/strsubr.h>
 #endif
 #include <sys/sysmacros.h>
@@ -88,22 +88,22 @@
 #include <sys/dtrace_impl.h>
 #include <sys/atomic.h>
 #include <sys/cmn_err.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/mutex_impl.h>
 #include <sys/rwlock_impl.h>
 #endif
 #include <sys/ctf_api.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/panic.h>
 #include <sys/priv_impl.h>
 #endif
 #include <sys/policy.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/cred_impl.h>
 #include <sys/procfs_isa.h>
 #endif
 #include <sys/taskq.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/mkdev.h>
 #include <sys/kdi.h>
 #endif
@@ -110,11 +110,13 @@
 #include <sys/zone.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
+#include "strtolctype.h"
 
 /* FreeBSD includes: */
-#if !defined(sun)
+#ifndef illumos
 #include <sys/callout.h>
 #include <sys/ctype.h>
+#include <sys/eventhandler.h>
 #include <sys/limits.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
@@ -153,8 +155,8 @@
 int		dtrace_destructive_disallow = 0;
 dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
 size_t		dtrace_difo_maxsize = (256 * 1024);
-dtrace_optval_t	dtrace_dof_maxsize = (256 * 1024);
-size_t		dtrace_global_maxsize = (16 * 1024);
+dtrace_optval_t	dtrace_dof_maxsize = (8 * 1024 * 1024);
+size_t		dtrace_statvar_maxsize = (16 * 1024);
 size_t		dtrace_actions_max = (16 * 1024);
 size_t		dtrace_retain_max = 1024;
 dtrace_optval_t	dtrace_helper_actions_max = 128;
@@ -175,7 +177,7 @@
 dtrace_optval_t dtrace_jstackframes_default = 50;
 dtrace_optval_t dtrace_jstackstrsize_default = 512;
 int		dtrace_msgdsize_max = 128;
-hrtime_t	dtrace_chill_max = 500 * (NANOSEC / MILLISEC);	/* 500 ms */
+hrtime_t	dtrace_chill_max = MSEC2NSEC(500);		/* 500 ms */
 hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
 int		dtrace_devdepth_max = 32;
 int		dtrace_err_verbose;
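The hunk above swaps the open-coded "500 * (NANOSEC / MILLISEC)" for the
MSEC2NSEC() conversion macro.  A minimal sketch of such a helper follows; in
illumos the macro lives in <sys/time.h>, but the exact definition shown here
is illustrative only, with hrtime_t approximated by a local typedef:

typedef long long hrtime_t;	/* stand-in for the kernel typedef */

#define	MILLISEC	1000
#define	NANOSEC		1000000000
#define	MSEC2NSEC(m)	((hrtime_t)(m) * (NANOSEC / MILLISEC))

/* MSEC2NSEC(500) == 500000000, i.e. 500 ms expressed in nanoseconds,
 * matching the old expression used for dtrace_chill_max. */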
@@ -183,6 +185,9 @@
 hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
 hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
 hrtime_t	dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
+#ifndef illumos
+int		dtrace_memstr_max = 4096;
+#endif
 
 /*
  * DTrace External Variables
@@ -198,10 +203,10 @@
 /*
  * DTrace Internal Variables
  */
-#if defined(sun)
+#ifdef illumos
 static dev_info_t	*dtrace_devi;		/* device info */
 #endif
-#if defined(sun)
+#ifdef illumos
 static vmem_t		*dtrace_arena;		/* probe ID arena */
 static vmem_t		*dtrace_minor;		/* minor number arena */
 #else
@@ -214,7 +219,8 @@
 static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
 static int		dtrace_opens;		/* number of opens */
 static int		dtrace_helpers;		/* number of helpers */
-#if defined(sun)
+static int		dtrace_getf;		/* number of unpriv getf()s */
+#ifdef illumos
 static void		*dtrace_softstate;	/* softstate pointer */
 #endif
 static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
@@ -231,14 +237,18 @@
 static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
 static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
 static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
+static dtrace_genid_t	dtrace_retained_gen;	/* current retained enab gen */
 static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */
-#if !defined(sun)
+static int		dtrace_dynvar_failclean; /* dynvars failed to clean */
+#ifndef illumos
 static struct mtx	dtrace_unr_mtx;
 MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
 int		dtrace_in_probe;	/* non-zero if executing a probe */
-#if defined(__i386__) || defined(__amd64__)
+#if defined(__i386__) || defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
 uintptr_t	dtrace_in_probe_addr;	/* Address of invop when already in probe */
 #endif
+static eventhandler_tag	dtrace_kld_load_tag;
+static eventhandler_tag	dtrace_kld_unload_try_tag;
 #endif
 
 /*
@@ -275,10 +285,8 @@
 static kmutex_t		dtrace_provider_lock;	/* provider state lock */
 static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */
 
-#if !defined(sun)
+#ifndef illumos
 /* XXX FreeBSD hacks. */
-static kmutex_t		mod_lock;
-
 #define cr_suid		cr_svuid
 #define cr_sgid		cr_svgid
 #define	ipaddr_t	in_addr_t
@@ -298,10 +306,11 @@
 #define PRIV_PROC_ZONE		(1 << 5)
 #define PRIV_ALL		~0
 
-SYSCTL_NODE(_debug, OID_AUTO, dtrace, CTLFLAG_RD, 0, "DTrace Information");
+SYSCTL_DECL(_debug_dtrace);
+SYSCTL_DECL(_kern_dtrace);
 #endif
 
-#if defined(sun)
+#ifdef illumos
 #define curcpu	CPU->cpu_id
 #endif
 
@@ -343,18 +352,23 @@
 
 /*
  * DTrace Helper Tracing Variables
+ *
+ * These variables should be set dynamically to enable helper tracing.  The
+ * only variables that should be set are dtrace_helptrace_enable (which should
+ * be set to a non-zero value to allocate helper tracing buffers on the next
+ * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a
+ * non-zero value to deallocate helper tracing buffers on the next close of
+ * /dev/dtrace).  When (and only when) helper tracing is disabled, the
+ * buffer size may also be set via dtrace_helptrace_bufsize.
  */
-uint32_t dtrace_helptrace_next = 0;
-uint32_t dtrace_helptrace_nlocals;
-char	*dtrace_helptrace_buffer;
-int	dtrace_helptrace_bufsize = 512 * 1024;
+int			dtrace_helptrace_enable = 0;
+int			dtrace_helptrace_disable = 0;
+int			dtrace_helptrace_bufsize = 16 * 1024 * 1024;
+uint32_t		dtrace_helptrace_nlocals;
+static dtrace_helptrace_t *dtrace_helptrace_buffer;
+static uint32_t		dtrace_helptrace_next = 0;
+static int		dtrace_helptrace_wrapped = 0;
 
-#ifdef DEBUG
-int	dtrace_helptrace_enabled = 1;
-#else
-int	dtrace_helptrace_enabled = 0;
-#endif
-
 /*
  * DTrace Error Hashing
  *
@@ -411,7 +425,7 @@
  * no way for a global variable key signature to match a thread-local key
  * signature.
  */
-#if defined(sun)
+#ifdef illumos
 #define	DTRACE_TLS_THRKEY(where) { \
 	uint_t intr = 0; \
 	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
@@ -462,8 +476,8 @@
  * disallow all negative sizes.  Ranges of size 0 are allowed.
  */
 #define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
-	((testaddr) - (baseaddr) < (basesz) && \
-	(testaddr) + (testsz) - (baseaddr) <= (basesz) && \
+	((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
+	(testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
 	(testaddr) + (testsz) >= (testaddr))
 
 /*
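The reworked DTRACE_INRANGE() above casts baseaddr to uintptr_t so the
pointer arithmetic is well defined for non-integer arguments; the third
clause rejects ranges whose end wraps around the address space.  A
self-contained user-space sketch of the same check (the inrange() helper is
hypothetical, not part of the patch):

#include <stdio.h>
#include <stdint.h>

static int
inrange(uintptr_t testaddr, size_t testsz, uintptr_t baseaddr, size_t basesz)
{
	return (testaddr - baseaddr < basesz &&
	    testaddr + testsz - baseaddr <= basesz &&
	    testaddr + testsz >= testaddr);	/* reject wraparound */
}

int
main(void)
{
	char buf[64];
	uintptr_t base = (uintptr_t)buf;

	printf("%d\n", inrange(base + 8, 16, base, sizeof (buf)));	/* 1 */
	printf("%d\n", inrange(base + 60, 16, base, sizeof (buf)));	/* 0 */
	printf("%d\n", inrange(base, SIZE_MAX, base, sizeof (buf)));	/* 0 */
	return (0);
}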
@@ -572,6 +586,8 @@
 dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
     size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
 uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);
+static int dtrace_priv_proc(dtrace_state_t *);
+static void dtrace_getf_barrier(void);
 
 /*
  * DTrace Probe Context Functions
@@ -596,7 +612,11 @@
 	va_list alist;
 
 	va_start(alist, format);
+#ifdef __FreeBSD__
+	vpanic(format, alist);
+#else
 	dtrace_vpanic(format, alist);
+#endif
 	va_end(alist);
 }
 
@@ -680,13 +700,33 @@
     dtrace_statvar_t **svars, int nsvars)
 {
 	int i;
+	size_t maxglobalsize, maxlocalsize;
 
+	if (nsvars == 0)
+		return (0);
+
+	maxglobalsize = dtrace_statvar_maxsize;
+	maxlocalsize = (maxglobalsize + sizeof (uint64_t)) * NCPU;
+
 	for (i = 0; i < nsvars; i++) {
 		dtrace_statvar_t *svar = svars[i];
+		uint8_t scope;
+		size_t size;
 
-		if (svar == NULL || svar->dtsv_size == 0)
+		if (svar == NULL || (size = svar->dtsv_size) == 0)
 			continue;
 
+		scope = svar->dtsv_var.dtdv_scope;
+
+		/*
+		 * We verify that our size is valid in the spirit of providing
+		 * defense in depth:  we want to prevent attackers from using
+		 * DTrace to escalate an orthogonal kernel heap corruption bug
+		 * into the ability to store to arbitrary locations in memory.
+		 */
+		VERIFY((scope == DIFV_SCOPE_GLOBAL && size < maxglobalsize) ||
+		    (scope == DIFV_SCOPE_LOCAL && size < maxlocalsize));
+
 		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
 			return (1);
 	}
@@ -716,7 +756,7 @@
 	 * up both thread-local variables and any global dynamically-allocated
 	 * variables.
 	 */
-	if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
+	if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
 	    vstate->dtvs_dynvars.dtds_size)) {
 		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
 		uintptr_t base = (uintptr_t)dstate->dtds_base +
@@ -783,6 +823,7 @@
     dtrace_vstate_t *vstate)
 {
 	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
+	file_t *fp;
 
 	/*
 	 * If we hold the privilege to read from kernel memory, then
@@ -800,10 +841,104 @@
 	/*
 	 * We're allowed to read from our own string table.
 	 */
-	if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
+	if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
 	    mstate->dtms_difo->dtdo_strlen))
 		return (1);
 
+	if (vstate->dtvs_state != NULL &&
+	    dtrace_priv_proc(vstate->dtvs_state)) {
+		proc_t *p;
+
+		/*
+		 * When we have privileges to the current process, there are
+		 * several context-related kernel structures that are safe to
+		 * read, even absent the privilege to read from kernel memory.
+		 * These reads are safe because these structures contain only
+		 * state that (1) we're permitted to read, (2) is harmless or
+		 * (3) contains pointers to additional kernel state that we're
+		 * not permitted to read (and as such, do not present an
+		 * opportunity for privilege escalation).  Finally (and
+		 * critically), because of the nature of their relation with
+		 * the current thread context, the memory associated with these
+		 * structures cannot change over the duration of probe context,
+		 * and it is therefore impossible for this memory to be
+		 * deallocated and reallocated as something else while it's
+		 * being operated upon.
+		 */
+		if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t)))
+			return (1);
+
+		if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
+		    sz, curthread->t_procp, sizeof (proc_t))) {
+			return (1);
+		}
+
+		if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
+		    curthread->t_cred, sizeof (cred_t))) {
+			return (1);
+		}
+
+#ifdef illumos
+		if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
+		    &(p->p_pidp->pid_id), sizeof (pid_t))) {
+			return (1);
+		}
+
+		if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
+		    curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
+			return (1);
+		}
+#endif
+	}
+
+	if ((fp = mstate->dtms_getf) != NULL) {
+		uintptr_t psz = sizeof (void *);
+		vnode_t *vp;
+		vnodeops_t *op;
+
+		/*
+		 * When getf() returns a file_t, the enabling is implicitly
+		 * granted the (transient) right to read the returned file_t
+		 * as well as the v_path and v_op->vnop_name of the underlying
+		 * vnode.  These accesses are allowed after a successful
+		 * getf() because the members that they refer to cannot change
+		 * once set -- and the barrier logic in the kernel's closef()
+		 * path assures that the file_t and its referenced vnode_t
+		 * cannot themselves be stale (that is, it is impossible for
+		 * either dtms_getf itself or its f_vnode member to reference
+		 * freed memory).
+		 */
+		if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t)))
+			return (1);
+
+		if ((vp = fp->f_vnode) != NULL) {
+#ifdef illumos
+			if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz))
+				return (1);
+			if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz,
+			    vp->v_path, strlen(vp->v_path) + 1)) {
+				return (1);
+			}
+#endif
+
+			if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz))
+				return (1);
+
+#ifdef illumos
+			if ((op = vp->v_op) != NULL &&
+			    DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
+				return (1);
+			}
+
+			if (op != NULL && op->vnop_name != NULL &&
+			    DTRACE_INRANGE(addr, sz, op->vnop_name,
+			    strlen(op->vnop_name) + 1)) {
+				return (1);
+			}
+#endif
+		}
+	}
+
 	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
 	*illval = addr;
 	return (0);
@@ -863,6 +998,58 @@
 }
 
 /*
+ * Convert a string to a signed integer using safe loads.
+ *
+ * NOTE: This function uses various macros from strtolctype.h to manipulate
+ * digit values, etc -- these have all been checked to ensure they make
+ * no additional function calls.
+ */
+static int64_t
+dtrace_strtoll(char *input, int base, size_t limit)
+{
+	uintptr_t pos = (uintptr_t)input;
+	int64_t val = 0;
+	int x;
+	boolean_t neg = B_FALSE;
+	char c, cc, ccc;
+	uintptr_t end = pos + limit;
+
+	/*
+	 * Consume any whitespace preceding digits.
+	 */
+	while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
+		pos++;
+
+	/*
+	 * Handle an explicit sign if one is present.
+	 */
+	if (c == '-' || c == '+') {
+		if (c == '-')
+			neg = B_TRUE;
+		c = dtrace_load8(++pos);
+	}
+
+	/*
+	 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
+	 * if present.
+	 */
+	if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
+	    cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
+		pos += 2;
+		c = ccc;
+	}
+
+	/*
+	 * Read in contiguous digits until the first non-digit character.
+	 */
+	for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
+	    c = dtrace_load8(++pos))
+		val = val * base + x;
+
+	return (neg ? -val : val);
+}
+
+/*
  * Compare two strings using safe loads.
  */
 static int
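dtrace_strtoll() above reads every byte through dtrace_load8() so that a
fault on a bad address is absorbed by probe-context fault handling instead of
panicking the kernel.  A user-space sketch of the same parsing steps, with a
plain load standing in for dtrace_load8() and the strtolctype.h macros
approximated with ctype (illustrative, bases up to 16 only):

#include <ctype.h>
#include <stdint.h>
#include <stddef.h>

static char load8(const char *p) { return (*p); }	/* safe-load stand-in */

int64_t
strtoll_sketch(const char *input, int base, size_t limit)
{
	const char *pos = input, *end = input + limit;
	int64_t val = 0;
	int neg = 0, x;
	char c;

	/* Consume any whitespace preceding digits. */
	while ((c = load8(pos)) == ' ' || c == '\t')
		pos++;

	/* Handle an explicit sign if one is present. */
	if (c == '-' || c == '+') {
		neg = (c == '-');
		c = load8(++pos);
	}

	/* Skip an explicit "0x"/"0X" prefix for base 16. */
	if (base == 16 && c == '0' &&
	    (load8(pos + 1) == 'x' || load8(pos + 1) == 'X') &&
	    isxdigit((unsigned char)load8(pos + 2))) {
		pos += 2;
		c = load8(pos);
	}

	/* Accumulate contiguous digits until a non-digit or the limit. */
	for (; pos < end && isxdigit((unsigned char)c) &&
	    (x = isdigit((unsigned char)c) ? c - '0' :
	    tolower((unsigned char)c) - 'a' + 10) < base;
	    c = load8(++pos))
		val = val * base + x;

	return (neg ? -val : val);
}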
@@ -1174,7 +1361,7 @@
 static int
 dtrace_priv_proc_common_zone(dtrace_state_t *state)
 {
-#if defined(sun)
+#ifdef illumos
 	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
 
 	/*
@@ -1183,8 +1370,7 @@
 	 */
 	ASSERT(s_cr != NULL);
 
-	if ((cr = CRED()) != NULL &&
-	    s_cr->cr_zone == cr->cr_zone)
+	if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
 		return (1);
 
 	return (0);
@@ -1284,6 +1470,115 @@
 }
 
 /*
+ * Determine if the dte_cond of the specified ECB allows for processing of
+ * the current probe to continue.  Note that this routine may allow continued
+ * processing, but with access(es) stripped from the mstate's dtms_access
+ * field.
+ */
+static int
+dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
+    dtrace_ecb_t *ecb)
+{
+	dtrace_probe_t *probe = ecb->dte_probe;
+	dtrace_provider_t *prov = probe->dtpr_provider;
+	dtrace_pops_t *pops = &prov->dtpv_pops;
+	int mode = DTRACE_MODE_NOPRIV_DROP;
+
+	ASSERT(ecb->dte_cond);
+
+#ifdef illumos
+	if (pops->dtps_mode != NULL) {
+		mode = pops->dtps_mode(prov->dtpv_arg,
+		    probe->dtpr_id, probe->dtpr_arg);
+
+		ASSERT((mode & DTRACE_MODE_USER) ||
+		    (mode & DTRACE_MODE_KERNEL));
+		ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
+		    (mode & DTRACE_MODE_NOPRIV_DROP));
+	}
+
+	/*
+	 * If the dte_cond bits indicate that this consumer is only allowed to
+	 * see user-mode firings of this probe, call the provider's dtps_mode()
+	 * entry point to check that the probe was fired while in a user
+	 * context.  If that's not the case, use the policy specified by the
+	 * provider to determine if we drop the probe or merely restrict
+	 * operation.
+	 */
+	if (ecb->dte_cond & DTRACE_COND_USERMODE) {
+		ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
+
+		if (!(mode & DTRACE_MODE_USER)) {
+			if (mode & DTRACE_MODE_NOPRIV_DROP)
+				return (0);
+
+			mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
+		}
+	}
+#endif
+
+	/*
+	 * This is more subtle than it looks. We have to be absolutely certain
+	 * that CRED() isn't going to change out from under us so it's only
+	 * legit to examine that structure if we're in constrained situations.
+	 * Currently, the only time we'll do this check is if a non-super-user
+	 * has enabled the profile or syscall providers -- providers that
+	 * allow visibility of all processes. For the profile case, the check
+	 * above will ensure that we're examining a user context.
+	 */
+	if (ecb->dte_cond & DTRACE_COND_OWNER) {
+		cred_t *cr;
+		cred_t *s_cr = state->dts_cred.dcr_cred;
+		proc_t *proc;
+
+		ASSERT(s_cr != NULL);
+
+		if ((cr = CRED()) == NULL ||
+		    s_cr->cr_uid != cr->cr_uid ||
+		    s_cr->cr_uid != cr->cr_ruid ||
+		    s_cr->cr_uid != cr->cr_suid ||
+		    s_cr->cr_gid != cr->cr_gid ||
+		    s_cr->cr_gid != cr->cr_rgid ||
+		    s_cr->cr_gid != cr->cr_sgid ||
+		    (proc = ttoproc(curthread)) == NULL ||
+		    (proc->p_flag & SNOCD)) {
+			if (mode & DTRACE_MODE_NOPRIV_DROP)
+				return (0);
+
+#ifdef illumos
+			mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
+#endif
+		}
+	}
+
+#ifdef illumos
+	/*
+	 * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
+	 * in our zone, check to see if our mode policy is to restrict rather
+	 * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
+	 * and DTRACE_ACCESS_ARGS
+	 */
+	if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
+		cred_t *cr;
+		cred_t *s_cr = state->dts_cred.dcr_cred;
+
+		ASSERT(s_cr != NULL);
+
+		if ((cr = CRED()) == NULL ||
+		    s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
+			if (mode & DTRACE_MODE_NOPRIV_DROP)
+				return (0);
+
+			mstate->dtms_access &=
+			    ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
+		}
+	}
+#endif
+
+	return (1);
+}
+
+/*
  * Note:  not called from probe context.  This function is called
  * asynchronously (and at a regular interval) from outside of probe context to
  * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
@@ -1294,13 +1589,13 @@
 {
 	dtrace_dynvar_t *dirty;
 	dtrace_dstate_percpu_t *dcpu;
-	int i, work = 0;
+	dtrace_dynvar_t **rinsep;
+	int i, j, work = 0;
 
 	for (i = 0; i < NCPU; i++) {
 		dcpu = &dstate->dtds_percpu[i];
+		rinsep = &dcpu->dtdsc_rinsing;
 
-		ASSERT(dcpu->dtdsc_rinsing == NULL);
-
 		/*
 		 * If the dirty list is NULL, there is no dirty work to do.
 		 */
@@ -1307,15 +1602,63 @@
 		if (dcpu->dtdsc_dirty == NULL)
 			continue;
 
-		/*
-		 * If the clean list is non-NULL, then we're not going to do
-		 * any work for this CPU -- it means that there has not been
-		 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
-		 * since the last time we cleaned house.
-		 */
-		if (dcpu->dtdsc_clean != NULL)
+		if (dcpu->dtdsc_rinsing != NULL) {
+			/*
+			 * If the rinsing list is non-NULL, then it is because
+			 * this CPU was selected to accept another CPU's
+			 * dirty list -- and since that time, dirty buffers
+			 * have accumulated.  This is a highly unlikely
+			 * condition, but we choose to ignore the dirty
+			 * buffers -- they'll be picked up in a future cleanse.
+			 */
 			continue;
+		}
 
+		if (dcpu->dtdsc_clean != NULL) {
+			/*
+			 * If the clean list is non-NULL, then we're in a
+			 * situation where a CPU has done deallocations (we
+			 * have a non-NULL dirty list) but no allocations (we
+			 * also have a non-NULL clean list).  We can't simply
+			 * move the dirty list into the clean list on this
+			 * CPU, yet we also don't want to allow this condition
+			 * to persist, lest a short clean list prevent a
+			 * massive dirty list from being cleaned (which in
+			 * turn could lead to otherwise avoidable dynamic
+			 * drops).  To deal with this, we look for some CPU
+			 * with a NULL clean list, NULL dirty list, and NULL
+			 * rinsing list -- and then we borrow this CPU to
+			 * rinse our dirty list.
+			 */
+			for (j = 0; j < NCPU; j++) {
+				dtrace_dstate_percpu_t *rinser;
+
+				rinser = &dstate->dtds_percpu[j];
+
+				if (rinser->dtdsc_rinsing != NULL)
+					continue;
+
+				if (rinser->dtdsc_dirty != NULL)
+					continue;
+
+				if (rinser->dtdsc_clean != NULL)
+					continue;
+
+				rinsep = &rinser->dtdsc_rinsing;
+				break;
+			}
+
+			if (j == NCPU) {
+				/*
+				 * We were unable to find another CPU that
+				 * could accept this dirty list -- we are
+				 * therefore unable to clean it now.
+				 */
+				dtrace_dynvar_failclean++;
+				continue;
+			}
+		}
+
 		work = 1;
 
 		/*
@@ -1331,7 +1674,7 @@
 			 * on a hash chain, either the dirty list or the
 			 * rinsing list for some CPU must be non-NULL.)
 			 */
-			dcpu->dtdsc_rinsing = dirty;
+			*rinsep = dirty;
 			dtrace_membar_producer();
 		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
 		    dirty, NULL) != dirty);
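The do/while above is the classic lock-free detach: publish the handoff,
then retry a compare-and-swap of the dirty list head to NULL until no
producer has raced in between.  A minimal user-space sketch of that detach
pattern with C11 atomics (names hypothetical; the rinsing-list store is
omitted):

#include <stdatomic.h>
#include <stddef.h>

struct dynvar { struct dynvar *next; };

/* Atomically detach an entire lock-free list so the caller owns it. */
static struct dynvar *
steal_list(_Atomic(struct dynvar *) *head)
{
	struct dynvar *dirty = atomic_load(head);

	do {
		if (dirty == NULL)
			return (NULL);
		/* a producer barrier sits here in the original */
	} while (!atomic_compare_exchange_weak(head, &dirty, NULL));

	return (dirty);
}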
@@ -1762,7 +2105,7 @@
 			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
 
 			/*
-			 * Now we'll move the clean list to the free list.
+			 * Now we'll move the clean list to our free list.
 			 * It's impossible for this to fail:  the only way
 			 * the free list can be updated is through this
 			 * code path, and only one CPU can own the clean list.
@@ -1775,6 +2118,7 @@
 			 * owners of the clean lists out before resetting
 			 * the clean lists.
 			 */
+			dcpu = &dstate->dtds_percpu[me];
 			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
 			ASSERT(rval == NULL);
 			goto retry;
@@ -2345,9 +2689,10 @@
 {
 	dtrace_speculation_t *spec;
 	dtrace_buffer_t *src, *dest;
-	uintptr_t daddr, saddr, dlimit;
+	uintptr_t daddr, saddr, dlimit, slimit;
 	dtrace_speculation_state_t current, new = 0;
 	intptr_t offs;
+	uint64_t timestamp;
 
 	if (which == 0)
 		return;
@@ -2423,7 +2768,37 @@
 	}
 
 	/*
-	 * We have the space; copy the buffer across.  (Note that this is a
+	 * We have sufficient space to copy the speculative buffer into the
+	 * primary buffer.  First, modify the speculative buffer, filling
+	 * in the timestamp of all entries with the current time.  The data
+	 * must have the commit() time rather than the time it was traced,
+	 * so that all entries in the primary buffer are in timestamp order.
+	 */
+	timestamp = dtrace_gethrtime();
+	saddr = (uintptr_t)src->dtb_tomax;
+	slimit = saddr + src->dtb_offset;
+	while (saddr < slimit) {
+		size_t size;
+		dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
+
+		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
+			saddr += sizeof (dtrace_epid_t);
+			continue;
+		}
+		ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
+		size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
+
+		ASSERT3U(saddr + size, <=, slimit);
+		ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
+		ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
+
+		DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
+
+		saddr += size;
+	}
+
+	/*
+	 * Copy the buffer across.  (Note that this is a
 	 * highly suboptimal bcopy(); in the unlikely event that this becomes
 	 * a serious performance issue, a high-performance DTrace-specific
 	 * bcopy() should obviously be invented.)
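The restamping loop above walks the speculative buffer one record at a time:
a bare dtrace_epid_t is padding (DTRACE_EPIDNONE), anything else is a full
record whose length comes from the ECB table.  A user-space sketch of the
same walk over a hypothetical flat record format (the header layout and
size_by_id table are assumptions for illustration):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* hypothetical header: id selects a fixed per-id record size */
typedef struct rechdr { uint32_t id; uint64_t timestamp; } rechdr_t;

static void
restamp(char *buf, size_t len, const size_t *size_by_id, uint64_t now)
{
	char *p = buf, *limit = buf + len;
	uint32_t id;

	while (p < limit) {
		memcpy(&id, p, sizeof (id));
		if (id == 0) {			/* padding, like EPIDNONE */
			p += sizeof (id);
			continue;
		}
		/* stamp the record with the commit-time timestamp */
		memcpy(p + offsetof(rechdr_t, timestamp), &now, sizeof (now));
		p += size_by_id[id];		/* advance by full record */
	}
}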
@@ -2836,7 +3211,7 @@
 
 		return (mstate->dtms_arg[ndx]);
 
-#if defined(sun)
+#ifdef illumos
 	case DIF_VAR_UREGS: {
 		klwp_t *lwp;
 
@@ -2870,7 +3245,7 @@
 #endif
 
 	case DIF_VAR_CURTHREAD:
-		if (!dtrace_priv_kernel(state))
+		if (!dtrace_priv_proc(state))
 			return (0);
 		return ((uint64_t)(uintptr_t)curthread);
 
@@ -2892,7 +3267,7 @@
 		}
 		return (mstate->dtms_walltimestamp);
 
-#if defined(sun)
+#ifdef illumos
 	case DIF_VAR_IPL:
 		if (!dtrace_priv_kernel(state))
 			return (0);
@@ -3029,7 +3404,7 @@
 		if (!dtrace_priv_proc(state))
 			return (0);
 
-#if defined(sun)
+#ifdef illumos
 		/*
 		 * Note that we are assuming that an unanchored probe is
 		 * always due to a high-level interrupt.  (And we're assuming
@@ -3055,7 +3430,7 @@
 		if (!dtrace_priv_proc(state))
 			return (0);
 
-#if defined(sun)
+#ifdef illumos
 		/*
 		 * See comment in DIF_VAR_PID.
 		 */
@@ -3070,11 +3445,14 @@
 		 */
 		return ((uint64_t)curthread->t_procp->p_ppid);
 #else
-		return ((uint64_t)curproc->p_pptr->p_pid);
+		if (curproc->p_pid == proc0.p_pid)
+			return (curproc->p_pid);
+		else
+			return (curproc->p_pptr->p_pid);
 #endif
 
 	case DIF_VAR_TID:
-#if defined(sun)
+#ifdef illumos
 		/*
 		 * See comment in DIF_VAR_PID.
 		 */
@@ -3095,7 +3473,7 @@
 	}
 
 	case DIF_VAR_EXECNAME:
-#if defined(sun)
+#ifdef illumos
 		if (!dtrace_priv_proc(state))
 			return (0);
 
@@ -3120,7 +3498,7 @@
 #endif
 
 	case DIF_VAR_ZONENAME:
-#if defined(sun)
+#ifdef illumos
 		if (!dtrace_priv_proc(state))
 			return (0);
 
@@ -3147,13 +3525,12 @@
 		if (!dtrace_priv_proc(state))
 			return (0);
 
-#if defined(sun)
+#ifdef illumos
 		/*
 		 * See comment in DIF_VAR_PID.
 		 */
 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
 			return ((uint64_t)p0.p_cred->cr_uid);
-#endif
 
 		/*
 		 * It is always safe to dereference one's own t_procp pointer:
@@ -3165,18 +3542,20 @@
 		 * credential, since this is never NULL after process birth.
 		 */
 		return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
+#else
+		return ((uint64_t)curthread->td_ucred->cr_uid);
+#endif
 
 	case DIF_VAR_GID:
 		if (!dtrace_priv_proc(state))
 			return (0);
 
-#if defined(sun)
+#ifdef illumos
 		/*
 		 * See comment in DIF_VAR_PID.
 		 */
 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
 			return ((uint64_t)p0.p_cred->cr_gid);
-#endif
 
 		/*
 		 * It is always safe to dereference one's own t_procp pointer:
@@ -3188,9 +3567,12 @@
 		 * credential, since this is never NULL after process birth.
 		 */
 		return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
+#else
+		return ((uint64_t)curthread->td_ucred->cr_gid);
+#endif
 
 	case DIF_VAR_ERRNO: {
-#if defined(sun)
+#ifdef illumos
 		klwp_t *lwp;
 		if (!dtrace_priv_proc(state))
 			return (0);
@@ -3215,7 +3597,7 @@
 		return (curthread->td_errno);
 #endif
 	}
-#if !defined(sun)
+#ifndef illumos
 	case DIF_VAR_CPU: {
 		return curcpu;
 	}
@@ -3226,7 +3608,464 @@
 	}
 }
 
+
+typedef enum dtrace_json_state {
+	DTRACE_JSON_REST = 1,
+	DTRACE_JSON_OBJECT,
+	DTRACE_JSON_STRING,
+	DTRACE_JSON_STRING_ESCAPE,
+	DTRACE_JSON_STRING_ESCAPE_UNICODE,
+	DTRACE_JSON_COLON,
+	DTRACE_JSON_COMMA,
+	DTRACE_JSON_VALUE,
+	DTRACE_JSON_IDENTIFIER,
+	DTRACE_JSON_NUMBER,
+	DTRACE_JSON_NUMBER_FRAC,
+	DTRACE_JSON_NUMBER_EXP,
+	DTRACE_JSON_COLLECT_OBJECT
+} dtrace_json_state_t;
+
 /*
+ * This function possesses just enough knowledge about JSON to extract a single
+ * value from a JSON string and store it in the scratch buffer.  It is able
+ * to extract nested object values, and members of arrays by index.
+ *
+ * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
+ * be looked up as we descend into the object tree.  e.g.
+ *
+ *    foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
+ *       with nelems = 5.
+ *
+ * The run time of this function must be bounded above by strsize to limit the
+ * amount of work done in probe context.  As such, it is implemented as a
+ * simple state machine, reading one character at a time using safe loads
+ * until we find the requested element, hit a parsing error or run off the
+ * end of the object or string.
+ *
+ * As there is no way for a subroutine to return an error without interrupting
+ * clause execution, we simply return NULL in the event of a missing key or any
+ * other error condition.  Each NULL return in this function is commented with
+ * the error condition it represents -- parsing or otherwise.
+ *
+ * The set of states for the state machine closely matches the JSON
+ * specification (http://json.org/).  Briefly:
+ *
+ *   DTRACE_JSON_REST:
+ *     Skip whitespace until we find either a top-level Object, moving
+ *     to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
+ *
+ *   DTRACE_JSON_OBJECT:
+ *     Locate the next key String in an Object.  Sets a flag to denote
+ *     the next String as a key string and moves to DTRACE_JSON_STRING.
+ *
+ *   DTRACE_JSON_COLON:
+ *     Skip whitespace until we find the colon that separates key Strings
+ *     from their values.  Once found, move to DTRACE_JSON_VALUE.
+ *
+ *   DTRACE_JSON_VALUE:
+ *     Detects the type of the next value (String, Number, Identifier, Object
+ *     or Array) and routes to the states that process that type.  Here we also
+ *     deal with the element selector list if we are requested to traverse down
+ *     into the object tree.
+ *
+ *   DTRACE_JSON_COMMA:
+ *     Skip whitespace until we find the comma that separates key-value pairs
+ *     in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
+ *     (similarly DTRACE_JSON_VALUE).  All following literal value processing
+ *     states return to this state at the end of their value, unless otherwise
+ *     noted.
+ *
+ *   DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
+ *     Processes a Number literal from the JSON, including any exponent
+ *     component that may be present.  Numbers are returned as strings, which
+ *     may be passed to strtoll() if an integer is required.
+ *
+ *   DTRACE_JSON_IDENTIFIER:
+ *     Processes a "true", "false" or "null" literal in the JSON.
+ *
+ *   DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
+ *   DTRACE_JSON_STRING_ESCAPE_UNICODE:
+ *     Processes a String literal from the JSON, whether the String denotes
+ *     a key, a value or part of a larger Object.  Handles all escape sequences
+ *     present in the specification, including four-digit unicode characters,
+ *     but merely includes the escape sequence without converting it to the
+ *     actual escaped character.  If the String is flagged as a key, we
+ *     move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
+ *
+ *   DTRACE_JSON_COLLECT_OBJECT:
+ *     This state collects an entire Object (or Array), correctly handling
+ *     embedded strings.  If the full element selector list matches this nested
+ *     object, we return the Object in full as a string.  If not, we use this
+ *     state to skip to the next value at this level and continue processing.
+ *
+ * NOTE: This function uses various macros from strtolctype.h to manipulate
+ * digit values, etc -- these have all been checked to ensure they make
+ * no additional function calls.
+ */
+static char *
+dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
+    char *dest)
+{
+	dtrace_json_state_t state = DTRACE_JSON_REST;
+	int64_t array_elem = INT64_MIN;
+	int64_t array_pos = 0;
+	uint8_t escape_unicount = 0;
+	boolean_t string_is_key = B_FALSE;
+	boolean_t collect_object = B_FALSE;
+	boolean_t found_key = B_FALSE;
+	boolean_t in_array = B_FALSE;
+	uint32_t braces = 0, brackets = 0;
+	char *elem = elemlist;
+	char *dd = dest;
+	uintptr_t cur;
+
+	for (cur = json; cur < json + size; cur++) {
+		char cc = dtrace_load8(cur);
+		if (cc == '\0')
+			return (NULL);
+
+		switch (state) {
+		case DTRACE_JSON_REST:
+			if (isspace(cc))
+				break;
+
+			if (cc == '{') {
+				state = DTRACE_JSON_OBJECT;
+				break;
+			}
+
+			if (cc == '[') {
+				in_array = B_TRUE;
+				array_pos = 0;
+				array_elem = dtrace_strtoll(elem, 10, size);
+				found_key = array_elem == 0 ? B_TRUE : B_FALSE;
+				state = DTRACE_JSON_VALUE;
+				break;
+			}
+
+			/*
+			 * ERROR: expected to find a top-level object or array.
+			 */
+			return (NULL);
+		case DTRACE_JSON_OBJECT:
+			if (isspace(cc))
+				break;
+
+			if (cc == '"') {
+				state = DTRACE_JSON_STRING;
+				string_is_key = B_TRUE;
+				break;
+			}
+
+			/*
+			 * ERROR: either the object did not start with a key
+			 * string, or we've run off the end of the object
+			 * without finding the requested key.
+			 */
+			return (NULL);
+		case DTRACE_JSON_STRING:
+			if (cc == '\\') {
+				*dd++ = '\\';
+				state = DTRACE_JSON_STRING_ESCAPE;
+				break;
+			}
+
+			if (cc == '"') {
+				if (collect_object) {
+					/*
+					 * We don't reset the dest here, as
+					 * the string is part of a larger
+					 * object being collected.
+					 */
+					*dd++ = cc;
+					collect_object = B_FALSE;
+					state = DTRACE_JSON_COLLECT_OBJECT;
+					break;
+				}
+				*dd = '\0';
+				dd = dest; /* reset string buffer */
+				if (string_is_key) {
+					if (dtrace_strncmp(dest, elem,
+					    size) == 0)
+						found_key = B_TRUE;
+				} else if (found_key) {
+					if (nelems > 1) {
+						/*
+						 * We expected an object, not
+						 * this string.
+						 */
+						return (NULL);
+					}
+					return (dest);
+				}
+				state = string_is_key ? DTRACE_JSON_COLON :
+				    DTRACE_JSON_COMMA;
+				string_is_key = B_FALSE;
+				break;
+			}
+
+			*dd++ = cc;
+			break;
+		case DTRACE_JSON_STRING_ESCAPE:
+			*dd++ = cc;
+			if (cc == 'u') {
+				escape_unicount = 0;
+				state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
+			} else {
+				state = DTRACE_JSON_STRING;
+			}
+			break;
+		case DTRACE_JSON_STRING_ESCAPE_UNICODE:
+			if (!isxdigit(cc)) {
+				/*
+				 * ERROR: invalid unicode escape, expected
+				 * four valid hexadecimal digits.
+				 */
+				return (NULL);
+			}
+
+			*dd++ = cc;
+			if (++escape_unicount == 4)
+				state = DTRACE_JSON_STRING;
+			break;
+		case DTRACE_JSON_COLON:
+			if (isspace(cc))
+				break;
+
+			if (cc == ':') {
+				state = DTRACE_JSON_VALUE;
+				break;
+			}
+
+			/*
+			 * ERROR: expected a colon.
+			 */
+			return (NULL);
+		case DTRACE_JSON_COMMA:
+			if (isspace(cc))
+				break;
+
+			if (cc == ',') {
+				if (in_array) {
+					state = DTRACE_JSON_VALUE;
+					if (++array_pos == array_elem)
+						found_key = B_TRUE;
+				} else {
+					state = DTRACE_JSON_OBJECT;
+				}
+				break;
+			}
+
+			/*
+			 * ERROR: either we hit an unexpected character, or
+			 * we reached the end of the object or array without
+			 * finding the requested key.
+			 */
+			return (NULL);
+		case DTRACE_JSON_IDENTIFIER:
+			if (islower(cc)) {
+				*dd++ = cc;
+				break;
+			}
+
+			*dd = '\0';
+			dd = dest; /* reset string buffer */
+
+			if (dtrace_strncmp(dest, "true", 5) == 0 ||
+			    dtrace_strncmp(dest, "false", 6) == 0 ||
+			    dtrace_strncmp(dest, "null", 5) == 0) {
+				if (found_key) {
+					if (nelems > 1) {
+						/*
+						 * ERROR: We expected an object,
+						 * not this identifier.
+						 */
+						return (NULL);
+					}
+					return (dest);
+				} else {
+					cur--;
+					state = DTRACE_JSON_COMMA;
+					break;
+				}
+			}
+
+			/*
+			 * ERROR: we did not recognise the identifier as one
+			 * of those in the JSON specification.
+			 */
+			return (NULL);
+		case DTRACE_JSON_NUMBER:
+			if (cc == '.') {
+				*dd++ = cc;
+				state = DTRACE_JSON_NUMBER_FRAC;
+				break;
+			}
+
+			if (cc == 'x' || cc == 'X') {
+				/*
+				 * ERROR: specification explicitly excludes
+				 * hexadecimal or octal numbers.
+				 */
+				return (NULL);
+			}
+
+			/* FALLTHRU */
+		case DTRACE_JSON_NUMBER_FRAC:
+			if (cc == 'e' || cc == 'E') {
+				*dd++ = cc;
+				state = DTRACE_JSON_NUMBER_EXP;
+				break;
+			}
+
+			if (cc == '+' || cc == '-') {
+				/*
+				 * ERROR: expect sign as part of exponent only.
+				 */
+				return (NULL);
+			}
+			/* FALLTHRU */
+		case DTRACE_JSON_NUMBER_EXP:
+			if (isdigit(cc) || cc == '+' || cc == '-') {
+				*dd++ = cc;
+				break;
+			}
+
+			*dd = '\0';
+			dd = dest; /* reset string buffer */
+			if (found_key) {
+				if (nelems > 1) {
+					/*
+					 * ERROR: We expected an object, not
+					 * this number.
+					 */
+					return (NULL);
+				}
+				return (dest);
+			}
+
+			cur--;
+			state = DTRACE_JSON_COMMA;
+			break;
+		case DTRACE_JSON_VALUE:
+			if (isspace(cc))
+				break;
+
+			if (cc == '{' || cc == '[') {
+				if (nelems > 1 && found_key) {
+					in_array = cc == '[' ? B_TRUE : B_FALSE;
+					/*
+					 * If our element selector directs us
+					 * to descend into this nested object,
+					 * then move to the next selector
+					 * element in the list and restart the
+					 * state machine.
+					 */
+					while (*elem != '\0')
+						elem++;
+					elem++; /* skip the inter-element NUL */
+					nelems--;
+					dd = dest;
+					if (in_array) {
+						state = DTRACE_JSON_VALUE;
+						array_pos = 0;
+						array_elem = dtrace_strtoll(
+						    elem, 10, size);
+						found_key = array_elem == 0 ?
+						    B_TRUE : B_FALSE;
+					} else {
+						found_key = B_FALSE;
+						state = DTRACE_JSON_OBJECT;
+					}
+					break;
+				}
+
+				/*
+				 * Otherwise, we wish to either skip this
+				 * nested object or return it in full.
+				 */
+				if (cc == '[')
+					brackets = 1;
+				else
+					braces = 1;
+				*dd++ = cc;
+				state = DTRACE_JSON_COLLECT_OBJECT;
+				break;
+			}
+
+			if (cc == '"') {
+				state = DTRACE_JSON_STRING;
+				break;
+			}
+
+			if (islower(cc)) {
+				/*
+				 * Here we deal with true, false and null.
+				 */
+				*dd++ = cc;
+				state = DTRACE_JSON_IDENTIFIER;
+				break;
+			}
+
+			if (cc == '-' || isdigit(cc)) {
+				*dd++ = cc;
+				state = DTRACE_JSON_NUMBER;
+				break;
+			}
+
+			/*
+			 * ERROR: unexpected character at start of value.
+			 */
+			return (NULL);
+		case DTRACE_JSON_COLLECT_OBJECT:
+			if (cc == '\0')
+				/*
+				 * ERROR: unexpected end of input.
+				 */
+				return (NULL);
+
+			*dd++ = cc;
+			if (cc == '"') {
+				collect_object = B_TRUE;
+				state = DTRACE_JSON_STRING;
+				break;
+			}
+
+			if (cc == ']') {
+				if (brackets-- == 0) {
+					/*
+					 * ERROR: unbalanced brackets.
+					 */
+					return (NULL);
+				}
+			} else if (cc == '}') {
+				if (braces-- == 0) {
+					/*
+					 * ERROR: unbalanced braces.
+					 */
+					return (NULL);
+				}
+			} else if (cc == '{') {
+				braces++;
+			} else if (cc == '[') {
+				brackets++;
+			}
+
+			if (brackets == 0 && braces == 0) {
+				if (found_key) {
+					*dd = '\0';
+					return (dest);
+				}
+				dd = dest; /* reset string buffer */
+				state = DTRACE_JSON_COMMA;
+			}
+			break;
+		}
+	}
+	return (NULL);
+}
+
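As a sketch of how the number states above compose (input chosen for
illustration, not taken from the patch), scanning the JSON number "1.5e+3"
proceeds as follows:

	/*
	 * '1'  DTRACE_JSON_NUMBER       digit accumulated into dest
	 * '.'  DTRACE_JSON_NUMBER       -> DTRACE_JSON_NUMBER_FRAC
	 * '5'  DTRACE_JSON_NUMBER_FRAC  digit (handled by the EXP case
	 *                               via the FALLTHRU)
	 * 'e'  DTRACE_JSON_NUMBER_FRAC  -> DTRACE_JSON_NUMBER_EXP
	 * '+'  DTRACE_JSON_NUMBER_EXP   sign accepted only in this state
	 * '3'  DTRACE_JSON_NUMBER_EXP   digit accumulated
	 * ','  terminates the number: dest is NUL-terminated, then either
	 *      returned (if this was the selected element) or discarded
	 *      before the machine re-enters DTRACE_JSON_COMMA.
	 */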
+/*
  * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
  * Notice that we don't bother validating the proper number of arguments or
  * their types in the tuple stack.  This isn't needed because all argument
@@ -3242,7 +4081,7 @@
 	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
 	dtrace_vstate_t *vstate = &state->dts_vstate;
 
-#if defined(sun)
+#ifdef illumos
 	union {
 		mutex_impl_t mi;
 		uint64_t mx;
@@ -3265,7 +4104,7 @@
 		regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
 		break;
 
-#if defined(sun)
+#ifdef illumos
 	case DIF_SUBR_MUTEX_OWNED:
 		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
 		    mstate, vstate)) {
@@ -3353,7 +4192,7 @@
 		regs[rd] = _RW_ISWRITER(&r.ri);
 		break;
 
-#else
+#else /* !illumos */
 	case DIF_SUBR_MUTEX_OWNED:
 		if (!dtrace_canload(tupregs[0].dttk_value,
 			sizeof (struct lock_object), mstate, vstate)) {
@@ -3432,7 +4271,7 @@
 		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
 		    lowner != NULL;
 		break;
-#endif /* ! defined(sun) */
+#endif /* illumos */
 
 	case DIF_SUBR_BCOPY: {
 		/*
@@ -3542,7 +4381,7 @@
 		break;
 	}
 
-#if defined(sun)
+#ifdef illumos
 	case DIF_SUBR_MSGSIZE:
 	case DIF_SUBR_MSGDSIZE: {
 		uintptr_t baddr = tupregs[0].dttk_value, daddr;
@@ -3610,7 +4449,7 @@
 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 
 		for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
-#if defined(sun)
+#ifdef illumos
 			if (p->p_pidp->pid_id == pid) {
 #else
 			if (p->p_pid == pid) {
@@ -3637,7 +4476,8 @@
 
 		if (!dtrace_destructive_disallow &&
 		    dtrace_priv_proc_control(state) &&
-		    !dtrace_istoxic(kaddr, size)) {
+		    !dtrace_istoxic(kaddr, size) &&
+		    dtrace_canload(kaddr, size, mstate, vstate)) {
 			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			dtrace_copyout(kaddr, uaddr, size, flags);
 			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
@@ -3652,7 +4492,8 @@
 
 		if (!dtrace_destructive_disallow &&
 		    dtrace_priv_proc_control(state) &&
-		    !dtrace_istoxic(kaddr, size)) {
+		    !dtrace_istoxic(kaddr, size) &&
+		    dtrace_strcanload(kaddr, size, mstate, vstate)) {
 			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			dtrace_copyoutstr(kaddr, uaddr, size, flags);
 			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
@@ -3977,7 +4818,7 @@
 		int64_t index = (int64_t)tupregs[1].dttk_value;
 		int64_t remaining = (int64_t)tupregs[2].dttk_value;
 		size_t len = dtrace_strlen((char *)s, size);
-		int64_t i = 0;
+		int64_t i;
 
 		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
 			regs[rd] = 0;
@@ -4022,6 +4863,65 @@
 		break;
 	}
 
+	case DIF_SUBR_JSON: {
+		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+		uintptr_t json = tupregs[0].dttk_value;
+		size_t jsonlen = dtrace_strlen((char *)json, size);
+		uintptr_t elem = tupregs[1].dttk_value;
+		size_t elemlen = dtrace_strlen((char *)elem, size);
+
+		char *dest = (char *)mstate->dtms_scratch_ptr;
+		char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
+		char *ee = elemlist;
+		int nelems = 1;
+		uintptr_t cur;
+
+		if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
+		    !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
+			regs[rd] = 0;
+			break;
+		}
+
+		if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+			regs[rd] = 0;
+			break;
+		}
+
+		/*
+		 * Read the element selector and split it up into a packed list
+		 * of strings.
+		 */
+		for (cur = elem; cur < elem + elemlen; cur++) {
+			char cc = dtrace_load8(cur);
+
+			if (cur == elem && cc == '[') {
+				/*
+				 * If the first element selector key is
+				 * actually an array index then ignore the
+				 * bracket.
+				 */
+				continue;
+			}
+
+			if (cc == ']')
+				continue;
+
+			if (cc == '.' || cc == '[') {
+				nelems++;
+				cc = '\0';
+			}
+
+			*ee++ = cc;
+		}
+		*ee++ = '\0';
+
+		if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
+		    nelems, dest)) != 0)
+			mstate->dtms_scratch_ptr += jsonlen + 1;
+		break;
+	}
+
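To illustrate the selector splitting above (example input hypothetical): a D
call such as json(this->j, "foo.bar[1]") arrives with elem pointing at the
selector string, which the loop repacks in scratch space as NUL-separated
components:

	/*
	 *   elem     = "foo.bar[1]"
	 *   elemlist = "foo\0bar\0" "1\0" "\0"	nelems = 3
	 *
	 * dtrace_json() then consumes one component per descent: object
	 * keys are matched by name, and a component that parses as an
	 * integer selects that index when a '[' opens an array.
	 */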
 	case DIF_SUBR_TOUPPER:
 	case DIF_SUBR_TOLOWER: {
 		uintptr_t s = tupregs[0].dttk_value;
@@ -4069,7 +4969,7 @@
 		break;
 	}
 
-#if defined(sun)
+#ifdef illumos
 	case DIF_SUBR_GETMAJOR:
 #ifdef _LP64
 		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
@@ -4331,6 +5231,28 @@
 		break;
 	}
 
+	case DIF_SUBR_STRTOLL: {
+		uintptr_t s = tupregs[0].dttk_value;
+		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+		int base = 10;
+
+		if (nargs > 1) {
+			if ((base = tupregs[1].dttk_value) <= 1 ||
+			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
+				*flags |= CPU_DTRACE_ILLOP;
+				break;
+			}
+		}
+
+		if (!dtrace_strcanload(s, size, mstate, vstate)) {
+			regs[rd] = INT64_MIN;
+			break;
+		}
+
+		regs[rd] = dtrace_strtoll((char *)s, base, size);
+		break;
+	}
+
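The base bound above evaluates to 36 -- ten digits plus twenty-six letters.
A minimal standalone restatement of the guard (sketch, not part of the
patch):

	static int
	strtoll_base_ok(int base)
	{
		/* Valid bases are 2..36 inclusive; 0 and 1 are rejected. */
		return (base > 1 &&
		    base <= ('z' - 'a' + 1) + ('9' - '0' + 1));
	}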
 	case DIF_SUBR_LLTOSTR: {
 		int64_t i = (int64_t)tupregs[0].dttk_value;
 		uint64_t val, digit;
@@ -4540,11 +5462,32 @@
 		break;
 	}
 
+	case DIF_SUBR_GETF: {
+		uintptr_t fd = tupregs[0].dttk_value;
+		struct filedesc *fdp;
+		file_t *fp;
+
+		if (!dtrace_priv_proc(state)) {
+			regs[rd] = 0;
+			break;
+		}
+		fdp = curproc->p_fd;
+		FILEDESC_SLOCK(fdp);
+		fp = fget_locked(fdp, fd);
+		mstate->dtms_getf = fp;
+		regs[rd] = (uintptr_t)fp;
+		FILEDESC_SUNLOCK(fdp);
+		break;
+	}
+
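The FreeBSD arm of getf() above is the classic descriptor-table lookup; in
isolation the pattern is (sketch, error handling elided):

	struct filedesc *fdp = curproc->p_fd;
	file_t *fp;

	FILEDESC_SLOCK(fdp);
	fp = fget_locked(fdp, fd);	/* NULL if fd is not open */
	FILEDESC_SUNLOCK(fdp);

Note that the pointer is published in dtms_getf without acquiring a file
reference; it is only ever dereferenced in probe context.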
 	case DIF_SUBR_CLEANPATH: {
 		char *dest = (char *)mstate->dtms_scratch_ptr, c;
 		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
 		uintptr_t src = tupregs[0].dttk_value;
 		int i = 0, j = 0;
+#ifdef illumos
+		zone_t *z;
+#endif
 
 		if (!dtrace_strcanload(src, size, mstate, vstate)) {
 			regs[rd] = 0;
@@ -4643,6 +5586,25 @@
 		} while (c != '\0');
 
 		dest[j] = '\0';
+
+#ifdef illumos
+		if (mstate->dtms_getf != NULL &&
+		    !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
+		    (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
+			/*
+			 * If we've done a getf() as a part of this ECB and we
+			 * don't have kernel access (and we're not in the global
+			 * zone), check if the path we cleaned up begins with
+			 * the zone's root path, and trim it off if so.  Note
+			 * that this is an output cleanliness issue, not a
+			 * security issue: knowing one's zone root path does
+			 * not enable privilege escalation.
+			 */
+			if (strstr(dest, z->zone_rootpath) == dest)
+				dest += strlen(z->zone_rootpath) - 1;
+		}
+#endif
+
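A worked example of the trim above (paths hypothetical):

	/*
	 * zone_rootpath  = "/zones/web01/root/"
	 * dest (cleaned) = "/zones/web01/root/etc/passwd"
	 *
	 * strstr() matches at offset 0, so dest is advanced by
	 * strlen(zone_rootpath) - 1 = 17 bytes, yielding "/etc/passwd" --
	 * the path as seen from inside the zone.  The "- 1" keeps the
	 * trailing '/' of the root path as the new leading '/'.
	 */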
 		regs[rd] = (uintptr_t)dest;
 		mstate->dtms_scratch_ptr += size;
 		break;
@@ -4748,7 +5710,7 @@
 			tryzero = -1;
 			numzero = 1;
 			for (i = 0; i < sizeof (struct in6_addr); i++) {
-#if defined(sun)
+#ifdef illumos
 				if (ip6._S6_un._S6_u8[i] == 0 &&
 #else
 				if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
@@ -4759,7 +5721,7 @@
 				}
 
 				if (tryzero != -1 &&
-#if defined(sun)
+#ifdef illumos
 				    (ip6._S6_un._S6_u8[i] != 0 ||
 #else
 				    (ip6.__u6_addr.__u6_addr8[i] != 0 ||
@@ -4775,7 +5737,7 @@
 					numzero = i - i % 2 - tryzero;
 					tryzero = -1;
 
-#if defined(sun)
+#ifdef illumos
 					if (ip6._S6_un._S6_u8[i] == 0 &&
 #else
 					if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
@@ -4796,7 +5758,7 @@
 				    i >= DTRACE_V4MAPPED_OFFSET; i--) {
 					ASSERT(end >= base);
 
-#if defined(sun)
+#ifdef illumos
 					val = ip6._S6_un._S6_u8[i];
 #else
 					val = ip6.__u6_addr.__u6_addr8[i];
@@ -4841,7 +5803,7 @@
 				if (i < 14 && i != firstzero - 2)
 					*end-- = ':';
 
-#if defined(sun)
+#ifdef illumos
 				val = (ip6._S6_un._S6_u8[i] << 8) +
 				    ip6._S6_un._S6_u8[i + 1];
 #else
@@ -4887,6 +5849,45 @@
 		break;
 	}
 
+#ifndef illumos
+	case DIF_SUBR_MEMSTR: {
+		char *str = (char *)mstate->dtms_scratch_ptr;
+		uintptr_t mem = tupregs[0].dttk_value;
+		char c = tupregs[1].dttk_value;
+		size_t size = tupregs[2].dttk_value;
+		uint8_t n;
+		int i;
+
+		regs[rd] = 0;
+
+		if (size == 0)
+			break;
+
+		if (!dtrace_canload(mem, size - 1, mstate, vstate))
+			break;
+
+		if (!DTRACE_INSCRATCH(mstate, size)) {
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+			break;
+		}
+
+		if (dtrace_memstr_max != 0 && size > dtrace_memstr_max) {
+			*flags |= CPU_DTRACE_ILLOP;
+			break;
+		}
+
+		for (i = 0; i < size - 1; i++) {
+			n = dtrace_load8(mem++);
+			str[i] = (n == 0) ? c : n;
+		}
+		str[size - 1] = 0;
+
+		regs[rd] = (uintptr_t)str;
+		mstate->dtms_scratch_ptr += size;
+		break;
+	}
+#endif
+
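For illustration (bytes hypothetical), memstr() turns a NUL-delimited region
such as a process argument vector into a single printable string:

	/*
	 * mem = { 'p', 's', '\0', '-', 'e', '\0' }, size = 6, c = ' '
	 *
	 * The loop copies size - 1 bytes, substituting c for each
	 * interior NUL, then terminates:
	 *
	 *   str = "ps -e"
	 */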
 	case DIF_SUBR_TYPEREF: {
 		uintptr_t size = 4 * sizeof(uintptr_t);
 		uintptr_t *typeref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
@@ -5077,102 +6078,95 @@
 				pc = DIF_INSTR_LABEL(instr);
 			break;
 		case DIF_OP_RLDSB:
-			if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
-				*flags |= CPU_DTRACE_KPRIV;
-				*illval = regs[r1];
+			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
 				break;
-			}
 			/*FALLTHROUGH*/
 		case DIF_OP_LDSB:
 			regs[rd] = (int8_t)dtrace_load8(regs[r1]);
 			break;
 		case DIF_OP_RLDSH:
-			if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
-				*flags |= CPU_DTRACE_KPRIV;
-				*illval = regs[r1];
+			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
 				break;
-			}
 			/*FALLTHROUGH*/
 		case DIF_OP_LDSH:
 			regs[rd] = (int16_t)dtrace_load16(regs[r1]);
 			break;
 		case DIF_OP_RLDSW:
-			if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
-				*flags |= CPU_DTRACE_KPRIV;
-				*illval = regs[r1];
+			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
 				break;
-			}
 			/*FALLTHROUGH*/
 		case DIF_OP_LDSW:
 			regs[rd] = (int32_t)dtrace_load32(regs[r1]);
 			break;
 		case DIF_OP_RLDUB:
-			if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
-				*flags |= CPU_DTRACE_KPRIV;
-				*illval = regs[r1];
+			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
 				break;
-			}
 			/*FALLTHROUGH*/
 		case DIF_OP_LDUB:
 			regs[rd] = dtrace_load8(regs[r1]);
 			break;
 		case DIF_OP_RLDUH:
-			if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
-				*flags |= CPU_DTRACE_KPRIV;
-				*illval = regs[r1];
+			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
 				break;
-			}
 			/*FALLTHROUGH*/
 		case DIF_OP_LDUH:
 			regs[rd] = dtrace_load16(regs[r1]);
 			break;
 		case DIF_OP_RLDUW:
-			if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
-				*flags |= CPU_DTRACE_KPRIV;
-				*illval = regs[r1];
+			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
 				break;
-			}
 			/*FALLTHROUGH*/
 		case DIF_OP_LDUW:
 			regs[rd] = dtrace_load32(regs[r1]);
 			break;
 		case DIF_OP_RLDX:
-			if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
-				*flags |= CPU_DTRACE_KPRIV;
-				*illval = regs[r1];
+			if (!dtrace_canload(regs[r1], 8, mstate, vstate))
 				break;
-			}
 			/*FALLTHROUGH*/
 		case DIF_OP_LDX:
 			regs[rd] = dtrace_load64(regs[r1]);
 			break;
 		case DIF_OP_ULDSB:
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			regs[rd] = (int8_t)
 			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
+			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 			break;
 		case DIF_OP_ULDSH:
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			regs[rd] = (int16_t)
 			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
+			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 			break;
 		case DIF_OP_ULDSW:
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			regs[rd] = (int32_t)
 			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
+			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 			break;
 		case DIF_OP_ULDUB:
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			regs[rd] =
 			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
+			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 			break;
 		case DIF_OP_ULDUH:
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			regs[rd] =
 			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
+			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 			break;
 		case DIF_OP_ULDUW:
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			regs[rd] =
 			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
+			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 			break;
 		case DIF_OP_ULDX:
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			regs[rd] =
 			    dtrace_fuword64((void *)(uintptr_t)regs[r1]);
+			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 			break;
 		case DIF_OP_RET:
 			rval = regs[rd];
@@ -5487,6 +6481,11 @@
 				    regs[r2] ? regs[r2] :
 				    dtrace_strsize_default) + 1;
 			} else {
+				if (regs[r2] > LONG_MAX) {
+					*flags |= CPU_DTRACE_ILLOP;
+					break;
+				}
+
 				tupregs[ttop].dttk_size = regs[r2];
 			}
 
@@ -5755,7 +6754,7 @@
 	c[i++] = ')';
 	c[i] = '\0';
 
-#if defined(sun)
+#ifdef illumos
 	debug_enter(c);
 #else
 	kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
@@ -5802,7 +6801,7 @@
 		return;
 	}
 
-#if defined(sun)
+#ifdef illumos
 	/*
 	 * raise() has a queue depth of 1 -- we ignore all subsequent
 	 * invocations of the raise() action.
@@ -5826,7 +6825,7 @@
 	if (dtrace_destructive_disallow)
 		return;
 
-#if defined(sun)
+#ifdef illumos
 	if (!curthread->t_dtrace_stop) {
 		curthread->t_dtrace_stop = 1;
 		curthread->t_sig_check = 1;
@@ -5845,7 +6844,7 @@
 {
 	hrtime_t now;
 	volatile uint16_t *flags;
-#if defined(sun)
+#ifdef illumos
 	cpu_t *cpu = CPU;
 #else
 	cpu_t *cpu = &solaris_cpu[curcpu];
@@ -5854,7 +6853,7 @@
 	if (dtrace_destructive_disallow)
 		return;
 
-	flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
+	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
 
 	now = dtrace_gethrtime();
 
@@ -6002,6 +7001,63 @@
 	mstate->dtms_scratch_ptr = old;
 }
 
+static void
+dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
+    size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
+{
+	volatile uint16_t *flags;
+	uint64_t val = *valp;
+	size_t valoffs = *valoffsp;
+
+	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
+	ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
+
+	/*
+	 * If this is a string, we're going to only load until we find the zero
+	 * byte -- after which we'll store zero bytes.
+	 */
+	if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
+		char c = '\0' + 1;
+		size_t s;
+
+		for (s = 0; s < size; s++) {
+			if (c != '\0' && dtkind == DIF_TF_BYREF) {
+				c = dtrace_load8(val++);
+			} else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
+				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+				c = dtrace_fuword8((void *)(uintptr_t)val++);
+				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+				if (*flags & CPU_DTRACE_FAULT)
+					break;
+			}
+
+			DTRACE_STORE(uint8_t, tomax, valoffs++, c);
+
+			if (c == '\0' && intuple)
+				break;
+		}
+	} else {
+		uint8_t c;
+		while (valoffs < end) {
+			if (dtkind == DIF_TF_BYREF) {
+				c = dtrace_load8(val++);
+			} else if (dtkind == DIF_TF_BYUREF) {
+				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+				c = dtrace_fuword8((void *)(uintptr_t)val++);
+				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+				if (*flags & CPU_DTRACE_FAULT)
+					break;
+			}
+
+			DTRACE_STORE(uint8_t, tomax,
+			    valoffs++, c);
+		}
+	}
+
+	*valp = val;
+	*valoffsp = valoffs;
+}
+
 /*
  * If you're looking for the epicenter of DTrace, you just found it.  This
  * is the function called by the provider to fire a probe -- from which all
@@ -6026,7 +7082,7 @@
 	if (panicstr != NULL)
 		return;
 
-#if defined(sun)
+#ifdef illumos
 	/*
 	 * Kick out immediately if this CPU is still being born (in which case
 	 * curthread will be set to -1) or the current thread can't allow
@@ -6051,7 +7107,7 @@
 		return;
 	}
 
-#if defined(sun)
+#ifdef illumos
 	if (panic_quiesce) {
 #else
 	if (panicstr != NULL) {
@@ -6063,7 +7119,8 @@
 		return;
 	}
 
-	now = dtrace_gethrtime();
+	now = mstate.dtms_timestamp = dtrace_gethrtime();
+	mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
 	vtime = dtrace_vtime_references != 0;
 
 	if (vtime && curthread->t_dtrace_start)
@@ -6104,6 +7161,8 @@
 		uint64_t val = 0;
 
 		mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
+		mstate.dtms_getf = NULL;
+
 		*flags &= ~CPU_DTRACE_ERROR;
 
 		if (prov == dtrace_provider) {
@@ -6155,7 +7214,7 @@
 			    probe->dtpr_id, probe->dtpr_arg) == 0)
 				continue;
 
-#if defined(sun)
+#ifdef illumos
 			/*
 			 * This is more subtle than it looks. We have to be
 			 * absolutely certain that CRED() isn't going to
@@ -6206,7 +7265,7 @@
 		if (now - state->dts_alive > dtrace_deadman_timeout) {
 			/*
 			 * We seem to be dead.  Unless we (a) have kernel
-			 * destructive permissions (b) have expicitly enabled
+			 * destructive permissions (b) have explicitly enabled
 			 * destructive actions and (c) destructive actions have
 			 * not been disabled, we're going to transition into
 			 * the KILLED state, from which no further processing
@@ -6234,8 +7293,18 @@
 		tomax = buf->dtb_tomax;
 		ASSERT(tomax != NULL);
 
-		if (ecb->dte_size != 0)
-			DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
+		if (ecb->dte_size != 0) {
+			dtrace_rechdr_t dtrh;
+			if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
+				mstate.dtms_timestamp = dtrace_gethrtime();
+				mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
+			}
+			ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
+			dtrh.dtrh_epid = ecb->dte_epid;
+			DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
+			    mstate.dtms_timestamp);
+			*((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
+		}
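The record header macro used above is defined in dtrace.h, which this diff
does not include; an assumed sketch of its shape, splitting the 64-bit
hrtime_t across two 32-bit fields so the header needs only 4-byte alignment:

	#define	DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) {		\
		(dtrh)->dtrh_timestamp_lo = (uint32_t)(hrtime);		\
		(dtrh)->dtrh_timestamp_hi = (hrtime) >> 32;		\
	}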
 
 		mstate.dtms_epid = ecb->dte_epid;
 		mstate.dtms_present |= DTRACE_MSTATE_EPID;
@@ -6247,7 +7316,7 @@
 
 		if (pred != NULL) {
 			dtrace_difo_t *dp = pred->dtp_difo;
-			int rval;
+			uint64_t rval;
 
 			rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
 
@@ -6382,7 +7451,9 @@
 				continue;
 
 			switch (act->dta_kind) {
-			case DTRACEACT_SPECULATE:
+			case DTRACEACT_SPECULATE: {
+				dtrace_rechdr_t *dtrh;
+
 				ASSERT(buf == &state->dts_buffer[cpuid]);
 				buf = dtrace_speculation_buffer(state,
 				    cpuid, val);
@@ -6404,10 +7475,23 @@
 				tomax = buf->dtb_tomax;
 				ASSERT(tomax != NULL);
 
-				if (ecb->dte_size != 0)
-					DTRACE_STORE(uint32_t, tomax, offs,
-					    ecb->dte_epid);
+				if (ecb->dte_size == 0)
+					continue;
+
+				ASSERT3U(ecb->dte_size, >=,
+				    sizeof (dtrace_rechdr_t));
+				dtrh = ((void *)(tomax + offs));
+				dtrh->dtrh_epid = ecb->dte_epid;
+				/*
+				 * When the speculation is committed, all of
+				 * the records in the speculative buffer will
+				 * have their timestamps set to the commit
+				 * time.  Until then, they are set to a
+				 * sentinel value, for debuggability.
+				 */
+				DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
 				continue;
+			}
 
 			case DTRACEACT_PRINTM: {
 				/* The DIF returns a 'memref'. */
@@ -6555,7 +7639,7 @@
 			case DTRACEACT_USYM:
 			case DTRACEACT_UMOD:
 			case DTRACEACT_UADDR: {
-#if defined(sun)
+#ifdef illumos
 				struct pid *pid = curthread->t_procp->p_pidp;
 #endif
 
@@ -6563,7 +7647,7 @@
 					continue;
 
 				DTRACE_STORE(uint64_t, tomax,
-#if defined(sun)
+#ifdef illumos
 				    valoffs, (uint64_t)pid->pid_id);
 #else
 				    valoffs, (uint64_t) curproc->p_pid);
@@ -6612,7 +7696,8 @@
 				ASSERT(0);
 			}
 
-			if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
+			if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ||
+			    dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) {
 				uintptr_t end = valoffs + size;
 
 				if (tracememsize != 0 &&
@@ -6621,40 +7706,15 @@
 					tracememsize = 0;
 				}
 
-				if (!dtrace_vcanload((void *)(uintptr_t)val,
+				if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
+				    !dtrace_vcanload((void *)(uintptr_t)val,
 				    &dp->dtdo_rtype, &mstate, vstate))
 					continue;
 
-				/*
-				 * If this is a string, we're going to only
-				 * load until we find the zero byte -- after
-				 * which we'll store zero bytes.
-				 */
-				if (dp->dtdo_rtype.dtdt_kind ==
-				    DIF_TYPE_STRING) {
-					char c = '\0' + 1;
-					int intuple = act->dta_intuple;
-					size_t s;
-
-					for (s = 0; s < size; s++) {
-						if (c != '\0')
-							c = dtrace_load8(val++);
-
-						DTRACE_STORE(uint8_t, tomax,
-						    valoffs++, c);
-
-						if (c == '\0' && intuple)
-							break;
-					}
-
-					continue;
-				}
-
-				while (valoffs < end) {
-					DTRACE_STORE(uint8_t, tomax, valoffs++,
-					    dtrace_load8(val++));
-				}
-
+				dtrace_store_by_ref(dp, tomax, size, &valoffs,
+				    &val, end, act->dta_intuple,
+				    dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
+				    DIF_TF_BYREF: DIF_TF_BYUREF);
 				continue;
 			}
 
@@ -7012,7 +8072,7 @@
 {
 	uint32_t priv;
 
-#if defined(sun)
+#ifdef illumos
 	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
 		/*
 		 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
@@ -7606,7 +8666,7 @@
 		 * already held.
 		 */
 		ASSERT(old == dtrace_provider);
-#if defined(sun)
+#ifdef illumos
 		ASSERT(dtrace_devi != NULL);
 #endif
 		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
@@ -7621,7 +8681,9 @@
 		}
 	} else {
 		mutex_enter(&dtrace_provider_lock);
+#ifdef illumos
 		mutex_enter(&mod_lock);
+#endif
 		mutex_enter(&dtrace_lock);
 	}
 
@@ -7635,7 +8697,9 @@
 	    dtrace_anon.dta_state->dts_necbs > 0))) {
 		if (!self) {
 			mutex_exit(&dtrace_lock);
+#ifdef illumos
 			mutex_exit(&mod_lock);
+#endif
 			mutex_exit(&dtrace_provider_lock);
 		}
 		return (EBUSY);
@@ -7669,7 +8733,9 @@
 
 		if (!self) {
 			mutex_exit(&dtrace_lock);
+#ifdef illumos
 			mutex_exit(&mod_lock);
+#endif
 			mutex_exit(&dtrace_provider_lock);
 		}
 
@@ -7723,7 +8789,7 @@
 		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
 		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
 		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
-#if defined(sun)
+#ifdef illumos
 		vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
 #else
 		free_unr(dtrace_arena, probe->dtpr_id);
@@ -7732,7 +8798,7 @@
 	}
 
 	if ((prev = dtrace_provider) == old) {
-#if defined(sun)
+#ifdef illumos
 		ASSERT(self || dtrace_devi == NULL);
 		ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
 #endif
@@ -7751,7 +8817,9 @@
 
 	if (!self) {
 		mutex_exit(&dtrace_lock);
+#ifdef illumos
 		mutex_exit(&mod_lock);
+#endif
 		mutex_exit(&dtrace_provider_lock);
 	}
 
@@ -7842,7 +8910,7 @@
 		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
 		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
 		kmem_free(probe, sizeof (dtrace_probe_t));
-#if defined(sun)
+#ifdef illumos
 		vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
 #else
 		free_unr(dtrace_arena, i + 1);
@@ -7882,7 +8950,7 @@
 		mutex_enter(&dtrace_lock);
 	}
 
-#if defined(sun)
+#ifdef illumos
 	id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
 	    VM_BESTFIT | VM_SLEEP);
 #else
@@ -8035,19 +9103,6 @@
 	(void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
 }
 
-#if !defined(sun)
-static int
-dtrace_probe_provide_cb(linker_file_t lf, void *arg)
-{
-	dtrace_provider_t *prv = (dtrace_provider_t *) arg;
-
-	prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, lf);
-
-	return(0);
-}
-#endif
-
-
 /*
  * Called to indicate that a probe -- or probes -- should be provided by a
  * specified provider.  If the specified description is NULL, the provider will
@@ -8066,7 +9121,7 @@
 static void
 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
 {
-#if defined(sun)
+#ifdef illumos
 	modctl_t *ctl;
 #endif
 	int all = 0;
@@ -8084,6 +9139,7 @@
 		 */
 		prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
 
+#ifdef illumos
 		/*
 		 * Now call the per-module provide operation.  We will grab
 		 * mod_lock to prevent the list from being modified.  Note
@@ -8092,7 +9148,6 @@
 		 */
 		mutex_enter(&mod_lock);
 
-#if defined(sun)
 		ctl = &modules;
 		do {
 			if (ctl->mod_busy || ctl->mod_mp == NULL)
@@ -8101,15 +9156,13 @@
 			prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
 
 		} while ((ctl = ctl->mod_next) != &modules);
-#else
-		(void) linker_file_foreach(dtrace_probe_provide_cb, prv);
-#endif
 
 		mutex_exit(&mod_lock);
+#endif
 	} while (all && (prv = prv->dtpv_next) != NULL);
 }
 
-#if defined(sun)
+#ifdef illumos
 /*
  * Iterate over each probe, and call the Framework-to-Provider API function
  * denoted by offs.
@@ -8270,6 +9323,10 @@
 		probe = (dof_probe_t *)(uintptr_t)(daddr +
 		    prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
 
+		/* See the check in dtrace_helper_provider_validate(). */
+		if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN)
+			continue;
+
 		dhpb.dthpb_mod = dhp->dofhp_mod;
 		dhpb.dthpb_func = strtab + probe->dofpr_func;
 		dhpb.dthpb_name = strtab + probe->dofpr_name;
@@ -8759,6 +9816,20 @@
 			    subr == DIF_SUBR_COPYOUTSTR) {
 				dp->dtdo_destructive = 1;
 			}
+
+			if (subr == DIF_SUBR_GETF) {
+				/*
+				 * If we have a getf() we need to record that
+				 * in our state.  Note that our state can be
+				 * NULL if this is a helper -- but in that
+				 * case, the call to getf() is itself illegal,
+				 * and will be caught (slightly later) when
+				 * the helper is validated.
+				 */
+				if (vstate->dtvs_state != NULL)
+					vstate->dtvs_state->dts_getf++;
+			}
+
 			break;
 		case DIF_OP_PUSHTR:
 			if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
@@ -8788,7 +9859,7 @@
 		    "expected 'ret' as last DIF instruction\n");
 	}
 
-	if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
+	if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
 		/*
 		 * If we're not returning by reference, the size must be either
 		 * 0 or the size of one of the base types.
@@ -8802,7 +9873,7 @@
 			break;
 
 		default:
-			err += efunc(dp->dtdo_len - 1, "bad return size");
+			err += efunc(dp->dtdo_len - 1, "bad return size\n");
 		}
 	}
 
@@ -8876,9 +9947,10 @@
 				break;
 			}
 
-			if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
-			    vt->dtdt_size > dtrace_global_maxsize) {
-				err += efunc(i, "oversized by-ref global\n");
+			if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
+			    v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
+			    vt->dtdt_size > dtrace_statvar_maxsize) {
+				err += efunc(i, "oversized by-ref static\n");
 				break;
 			}
 		}
@@ -9038,7 +10110,9 @@
 			    subr == DIF_SUBR_INET_NTOA ||
 			    subr == DIF_SUBR_INET_NTOA6 ||
 			    subr == DIF_SUBR_INET_NTOP ||
+			    subr == DIF_SUBR_JSON ||
 			    subr == DIF_SUBR_LLTOSTR ||
+			    subr == DIF_SUBR_STRTOLL ||
 			    subr == DIF_SUBR_RINDEX ||
 			    subr == DIF_SUBR_STRCHR ||
 			    subr == DIF_SUBR_STRJOIN ||
@@ -9051,6 +10125,9 @@
 			    subr == DIF_SUBR_NTOHL ||
 			    subr == DIF_SUBR_NTOHLL ||
 			    subr == DIF_SUBR_MEMREF ||
+#ifndef illumos
+			    subr == DIF_SUBR_MEMSTR ||
+#endif
 			    subr == DIF_SUBR_TYPEREF)
 				break;
 
@@ -9217,6 +10294,9 @@
 				if (srd == 0)
 					return;
 
+				if (sval > LONG_MAX)
+					return;
+
 				tupregs[ttop++].dttk_size = sval;
 			}
 
@@ -9278,6 +10358,19 @@
 		 */
 		size = P2ROUNDUP(size, sizeof (uint64_t));
 
+		/*
+		 * Before setting the chunk size, check that we're not going
+		 * to set it to a negative value...
+		 */
+		if (size > LONG_MAX)
+			return;
+
+		/*
+		 * ...and make certain that we didn't badly overflow.
+		 */
+		if (size < ksize || size < sizeof (dtrace_dynvar_t))
+			return;
+
 		if (size > vstate->dtvs_dynvars.dtds_chunksize)
 			vstate->dtvs_dynvars.dtds_chunksize = size;
 	}
@@ -9686,7 +10779,7 @@
 {
 	dtrace_actdesc_t *act;
 
-#if defined(sun)
+#ifdef illumos
 	ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
 	    arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
 #endif
@@ -9725,7 +10818,7 @@
 	if (DTRACEACT_ISPRINTFLIKE(kind)) {
 		char *str = (char *)(uintptr_t)act->dtad_arg;
 
-#if defined(sun)
+#ifdef illumos
 		ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
 		    (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
 #endif
@@ -9754,9 +10847,9 @@
 
 	/*
 	 * The default size is the size of the default action: recording
-	 * the epid.
+	 * the header.
 	 */
-	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
+	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
 	ecb->dte_alignment = sizeof (dtrace_epid_t);
 
 	epid = state->dts_epid++;
@@ -9854,122 +10947,89 @@
 static void
 dtrace_ecb_resize(dtrace_ecb_t *ecb)
 {
-	uint32_t maxalign = sizeof (dtrace_epid_t);
-	uint32_t align = sizeof (uint8_t), offs, diff;
 	dtrace_action_t *act;
-	int wastuple = 0;
+	uint32_t curneeded = UINT32_MAX;
 	uint32_t aggbase = UINT32_MAX;
-	dtrace_state_t *state = ecb->dte_state;
 
 	/*
-	 * If we record anything, we always record the epid.  (And we always
-	 * record it first.)
+	 * If we record anything, we always record the dtrace_rechdr_t.  (And
+	 * we always record it first.)
 	 */
-	offs = sizeof (dtrace_epid_t);
-	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
+	ecb->dte_size = sizeof (dtrace_rechdr_t);
+	ecb->dte_alignment = sizeof (dtrace_epid_t);
 
 	for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
 		dtrace_recdesc_t *rec = &act->dta_rec;
+		ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
 
-		if ((align = rec->dtrd_alignment) > maxalign)
-			maxalign = align;
+		ecb->dte_alignment = MAX(ecb->dte_alignment,
+		    rec->dtrd_alignment);
 
-		if (!wastuple && act->dta_intuple) {
-			/*
-			 * This is the first record in a tuple.  Align the
-			 * offset to be at offset 4 in an 8-byte aligned
-			 * block.
-			 */
-			diff = offs + sizeof (dtrace_aggid_t);
-
-			if ((diff = (diff & (sizeof (uint64_t) - 1))))
-				offs += sizeof (uint64_t) - diff;
-
-			aggbase = offs - sizeof (dtrace_aggid_t);
-			ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
-		}
-
-		/*LINTED*/
-		if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
-			/*
-			 * The current offset is not properly aligned; align it.
-			 */
-			offs += align - diff;
-		}
-
-		rec->dtrd_offset = offs;
-
-		if (offs + rec->dtrd_size > ecb->dte_needed) {
-			ecb->dte_needed = offs + rec->dtrd_size;
-
-			if (ecb->dte_needed > state->dts_needed)
-				state->dts_needed = ecb->dte_needed;
-		}
-
 		if (DTRACEACT_ISAGG(act->dta_kind)) {
 			dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
-			dtrace_action_t *first = agg->dtag_first, *prev;
 
-			ASSERT(rec->dtrd_size != 0 && first != NULL);
-			ASSERT(wastuple);
+			ASSERT(rec->dtrd_size != 0);
+			ASSERT(agg->dtag_first != NULL);
+			ASSERT(act->dta_prev->dta_intuple);
 			ASSERT(aggbase != UINT32_MAX);
+			ASSERT(curneeded != UINT32_MAX);
 
 			agg->dtag_base = aggbase;
 
-			while ((prev = first->dta_prev) != NULL &&
-			    DTRACEACT_ISAGG(prev->dta_kind)) {
-				agg = (dtrace_aggregation_t *)prev;
-				first = agg->dtag_first;
-			}
+			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
+			rec->dtrd_offset = curneeded;
+			curneeded += rec->dtrd_size;
+			ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
 
-			if (prev != NULL) {
-				offs = prev->dta_rec.dtrd_offset +
-				    prev->dta_rec.dtrd_size;
-			} else {
-				offs = sizeof (dtrace_epid_t);
+			aggbase = UINT32_MAX;
+			curneeded = UINT32_MAX;
+		} else if (act->dta_intuple) {
+			if (curneeded == UINT32_MAX) {
+				/*
+				 * This is the first record in a tuple.  Align
+				 * curneeded to be at offset 4 in an 8-byte
+				 * aligned block.
+				 */
+				ASSERT(act->dta_prev == NULL ||
+				    !act->dta_prev->dta_intuple);
+				ASSERT3U(aggbase, ==, UINT32_MAX);
+				curneeded = P2PHASEUP(ecb->dte_size,
+				    sizeof (uint64_t), sizeof (dtrace_aggid_t));
+
+				aggbase = curneeded - sizeof (dtrace_aggid_t);
+				ASSERT(IS_P2ALIGNED(aggbase,
+				    sizeof (uint64_t)));
 			}
-			wastuple = 0;
+			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
+			rec->dtrd_offset = curneeded;
+			curneeded += rec->dtrd_size;
 		} else {
-			if (!act->dta_intuple)
-				ecb->dte_size = offs + rec->dtrd_size;
+			/* tuples must be followed by an aggregation */
+			ASSERT(act->dta_prev == NULL ||
+			    !act->dta_prev->dta_intuple);
 
-			offs += rec->dtrd_size;
+			ecb->dte_size = P2ROUNDUP(ecb->dte_size,
+			    rec->dtrd_alignment);
+			rec->dtrd_offset = ecb->dte_size;
+			ecb->dte_size += rec->dtrd_size;
+			ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
 		}
-
-		wastuple = act->dta_intuple;
 	}
 
 	if ((act = ecb->dte_action) != NULL &&
 	    !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
-	    ecb->dte_size == sizeof (dtrace_epid_t)) {
+	    ecb->dte_size == sizeof (dtrace_rechdr_t)) {
 		/*
-		 * If the size is still sizeof (dtrace_epid_t), then all
+		 * If the size is still sizeof (dtrace_rechdr_t), then all
 		 * actions store no data; set the size to 0.
 		 */
-		ecb->dte_alignment = maxalign;
 		ecb->dte_size = 0;
-
-		/*
-		 * If the needed space is still sizeof (dtrace_epid_t), then
-		 * all actions need no additional space; set the needed
-		 * size to 0.
-		 */
-		if (ecb->dte_needed == sizeof (dtrace_epid_t))
-			ecb->dte_needed = 0;
-
-		return;
 	}
 
-	/*
-	 * Set our alignment, and make sure that the dte_size and dte_needed
-	 * are aligned to the size of an EPID.
-	 */
-	ecb->dte_alignment = maxalign;
-	ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
-	    ~(sizeof (dtrace_epid_t) - 1);
-	ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
-	    ~(sizeof (dtrace_epid_t) - 1);
-	ASSERT(ecb->dte_size <= ecb->dte_needed);
+	ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
+	ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
+	ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
+	    ecb->dte_needed);
 }
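The alignment helpers leaned on above come from sys/sysmacros.h; a sketch of
their semantics for reference (not code added by this diff):

	/*
	 * P2ROUNDUP(x, a)      round x up to the next multiple of a
	 * P2PHASEUP(x, a, ph)  round x up to the next value congruent
	 *                      to ph modulo a
	 *
	 * P2PHASEUP(ecb->dte_size, sizeof (uint64_t),
	 * sizeof (dtrace_aggid_t)) therefore lands the first tuple
	 * record at offset 4 within an 8-byte-aligned block, e.g.
	 * P2PHASEUP(24, 8, 4) == 28, leaving aggbase = 24 for the
	 * aggregation ID immediately before it.
	 */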
 
 static dtrace_action_t *
@@ -10117,7 +11177,7 @@
 	/*
 	 * We need to allocate an id for this aggregation.
 	 */
-#if defined(sun)
+#ifdef illumos
 	aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
 	    VM_BESTFIT | VM_SLEEP);
 #else
@@ -10171,7 +11231,7 @@
 	dtrace_aggid_t aggid = agg->dtag_id;
 
 	ASSERT(DTRACEACT_ISAGG(act->dta_kind));
-#if defined(sun)
+#ifdef illumos
 	vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
 #else
 	free_unr(state->dts_aggid_arena, aggid);
@@ -10240,7 +11300,7 @@
 				format = 0;
 			} else {
 				ASSERT(arg != 0);
-#if defined(sun)
+#ifdef illumos
 				ASSERT(arg > KERNELBASE);
 #endif
 				format = dtrace_format_add(state,
@@ -10349,7 +11409,7 @@
 			break;
 
 		case DTRACEACT_SPECULATE:
-			if (ecb->dte_size > sizeof (dtrace_epid_t))
+			if (ecb->dte_size > sizeof (dtrace_rechdr_t))
 				return (EINVAL);
 
 			if (dp == NULL)
@@ -10470,7 +11530,7 @@
 
 	ecb->dte_action = NULL;
 	ecb->dte_action_last = NULL;
-	ecb->dte_size = sizeof (dtrace_epid_t);
+	ecb->dte_size = 0;
 }
 
 static void
@@ -10739,12 +11799,13 @@
 	caddr_t tomax = buf->dtb_tomax;
 	caddr_t xamot = buf->dtb_xamot;
 	dtrace_icookie_t cookie;
-	hrtime_t now = dtrace_gethrtime();
+	hrtime_t now;
 
 	ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
 	ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
 
 	cookie = dtrace_interrupt_disable();
+	now = dtrace_gethrtime();
 	buf->dtb_tomax = xamot;
 	buf->dtb_xamot = tomax;
 	buf->dtb_xamot_drops = buf->dtb_drops;
@@ -10789,17 +11850,20 @@
 
 static int
 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
-    processorid_t cpu)
+    processorid_t cpu, int *factor)
 {
-#if defined(sun)
+#ifdef illumos
 	cpu_t *cp;
 #endif
 	dtrace_buffer_t *buf;
+	int allocated = 0, desired = 0;
 
-#if defined(sun)
+#ifdef illumos
 	ASSERT(MUTEX_HELD(&cpu_lock));
 	ASSERT(MUTEX_HELD(&dtrace_lock));
 
+	*factor = 1;
+
 	if (size > dtrace_nonroot_maxsize &&
 	    !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
 		return (EFBIG);
@@ -10823,7 +11887,8 @@
 
 		ASSERT(buf->dtb_xamot == NULL);
 
-		if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
+		if ((buf->dtb_tomax = kmem_zalloc(size,
+		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
 			goto err;
 
 		buf->dtb_size = size;
@@ -10834,7 +11899,8 @@
 		if (flags & DTRACEBUF_NOSWITCH)
 			continue;
 
-		if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
+		if ((buf->dtb_xamot = kmem_zalloc(size,
+		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
 			goto err;
 	} while ((cp = cp->cpu_next) != cpu_list);
 
@@ -10848,16 +11914,19 @@
 			continue;
 
 		buf = &bufs[cp->cpu_id];
+		desired += 2;
 
 		if (buf->dtb_xamot != NULL) {
 			ASSERT(buf->dtb_tomax != NULL);
 			ASSERT(buf->dtb_size == size);
 			kmem_free(buf->dtb_xamot, size);
+			allocated++;
 		}
 
 		if (buf->dtb_tomax != NULL) {
 			ASSERT(buf->dtb_size == size);
 			kmem_free(buf->dtb_tomax, size);
+			allocated++;
 		}
 
 		buf->dtb_tomax = NULL;
@@ -10864,12 +11933,11 @@
 		buf->dtb_xamot = NULL;
 		buf->dtb_size = 0;
 	} while ((cp = cp->cpu_next) != cpu_list);
-
-	return (ENOMEM);
 #else
 	int i;
 
-#if defined(__amd64__)
+	*factor = 1;
+#if defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
 	/*
 	 * FreeBSD isn't good at limiting the amount of memory we
 	 * ask to malloc, so let's place a limit here before trying
@@ -10876,7 +11944,7 @@
 	 * to do something that might well end in tears at bedtime.
 	 */
 	if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
-		return(ENOMEM);
+		return (ENOMEM);
 #endif
 
 	ASSERT(MUTEX_HELD(&dtrace_lock));
@@ -10898,7 +11966,8 @@
 
 		ASSERT(buf->dtb_xamot == NULL);
 
-		if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
+		if ((buf->dtb_tomax = kmem_zalloc(size,
+		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
 			goto err;
 
 		buf->dtb_size = size;
@@ -10909,7 +11978,8 @@
 		if (flags & DTRACEBUF_NOSWITCH)
 			continue;
 
-		if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
+		if ((buf->dtb_xamot = kmem_zalloc(size,
+		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
 			goto err;
 	}
 
@@ -10925,16 +11995,19 @@
 			continue;
 
 		buf = &bufs[i];
+		desired += 2;
 
 		if (buf->dtb_xamot != NULL) {
 			ASSERT(buf->dtb_tomax != NULL);
 			ASSERT(buf->dtb_size == size);
 			kmem_free(buf->dtb_xamot, size);
+			allocated++;
 		}
 
 		if (buf->dtb_tomax != NULL) {
 			ASSERT(buf->dtb_size == size);
 			kmem_free(buf->dtb_tomax, size);
+			allocated++;
 		}
 
 		buf->dtb_tomax = NULL;
@@ -10942,9 +12015,10 @@
 		buf->dtb_size = 0;
 
 	}
+#endif
+	*factor = desired / (allocated > 0 ? allocated : 1);
 
 	return (ENOMEM);
-#endif
 }
 
 /*
@@ -11110,7 +12184,7 @@
 			if (epid == DTRACE_EPIDNONE) {
 				size = sizeof (uint32_t);
 			} else {
-				ASSERT(epid <= state->dts_necbs);
+				ASSERT3U(epid, <=, state->dts_necbs);
 				ASSERT(state->dts_ecbs[epid - 1] != NULL);
 
 				size = state->dts_ecbs[epid - 1]->dte_size;
@@ -11434,6 +12508,7 @@
 		ASSERT(enab->dten_vstate->dtvs_state != NULL);
 		ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
 		enab->dten_vstate->dtvs_state->dts_nretained--;
+		dtrace_retained_gen++;
 	}
 
 	if (enab->dten_prev == NULL) {
@@ -11476,6 +12551,7 @@
 		return (ENOSPC);
 
 	state->dts_nretained++;
+	dtrace_retained_gen++;
 
 	if (dtrace_retained == NULL) {
 		dtrace_retained = enab;
@@ -11653,10 +12729,11 @@
 	 * block pending our completion.
 	 */
 	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
-#if defined(sun)
+#ifdef illumos
 		cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
 
-		if (INGLOBALZONE(curproc) || getzoneid() == crgetzoneid(cr))
+		if (INGLOBALZONE(curproc) ||
+		    cr != NULL && getzoneid() == crgetzoneid(cr))
 #endif
 			(void) dtrace_enabling_match(enab, NULL);
 	}
@@ -11717,6 +12794,7 @@
 {
 	int i, all = 0;
 	dtrace_probedesc_t desc;
+	dtrace_genid_t gen;
 
 	ASSERT(MUTEX_HELD(&dtrace_lock));
 	ASSERT(MUTEX_HELD(&dtrace_provider_lock));
@@ -11727,15 +12805,25 @@
 	}
 
 	do {
-		dtrace_enabling_t *enab = dtrace_retained;
+		dtrace_enabling_t *enab;
 		void *parg = prv->dtpv_arg;
 
-		for (; enab != NULL; enab = enab->dten_next) {
+retry:
+		gen = dtrace_retained_gen;
+		for (enab = dtrace_retained; enab != NULL;
+		    enab = enab->dten_next) {
 			for (i = 0; i < enab->dten_ndesc; i++) {
 				desc = enab->dten_desc[i]->dted_probe;
 				mutex_exit(&dtrace_lock);
 				prv->dtpv_pops.dtps_provide(parg, &desc);
 				mutex_enter(&dtrace_lock);
+				/*
+				 * Process the retained enablings again if
+				 * they have changed while we weren't holding
+				 * dtrace_lock.
+				 */
+				if (gen != dtrace_retained_gen)
+					goto retry;
 			}
 		}
 	} while (all && (prv = prv->dtpv_next) != NULL);
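The retry above is a generation-counter pattern; in miniature (sketch):

	/*
	 *	gen = dtrace_retained_gen;	snapshot under dtrace_lock
	 *	mutex_exit(&dtrace_lock);
	 *	...the provider callout may alter the retained list...
	 *	mutex_enter(&dtrace_lock);
	 *	if (gen != dtrace_retained_gen)
	 *		goto retry;		list changed; rescan
	 *
	 * dtrace_retained_gen is bumped by the dtrace_enabling_retain()
	 * and dtrace_enabling_destroy() hunks earlier in this diff.
	 */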
@@ -11936,7 +13024,8 @@
 
 	dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
 
-	if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0) {
+	if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
+	    dof->dofh_loadsz != hdr.dofh_loadsz) {
 		kmem_free(dof, hdr.dofh_loadsz);
 		*errp = EFAULT;
 		return (NULL);
@@ -11945,7 +13034,7 @@
 	return (dof);
 }
 
-#if !defined(sun)
+#ifndef illumos
 static __inline uchar_t
 dtrace_dof_char(char c) {
 	switch (c) {
@@ -11988,7 +13077,7 @@
 	unsigned int len, i;
 	dof_hdr_t *dof;
 
-#if defined(sun)
+#ifdef illumos
 	/*
 	 * Unfortunately, array of values in .conf files are always (and
 	 * only) interpreted to be integer arrays.  We must read our DOF
@@ -12746,10 +13835,17 @@
 			}
 		}
 
+		if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
+		    !(sec->dofs_flags & DOF_SECF_LOAD)) {
+			dtrace_dof_error(dof, "loadable section with load "
+			    "flag unset");
+			return (-1);
+		}
+
 		if (!(sec->dofs_flags & DOF_SECF_LOAD))
 			continue; /* just ignore non-loadable sections */
 
-		if (sec->dofs_align & (sec->dofs_align - 1)) {
+		if (!ISP2(sec->dofs_align)) {
 			dtrace_dof_error(dof, "bad section alignment");
 			return (-1);
 		}
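ISP2() names the power-of-two test the removed line open-coded; its
conventional definition (as in sys/sysmacros.h, shown for reference):

	#define	ISP2(x)		(((x) & ((x) - 1)) == 0)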
@@ -12894,10 +13990,12 @@
 	if ((dstate->dtds_chunksize = chunksize) == 0)
 		dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
 
+	VERIFY(dstate->dtds_chunksize < LONG_MAX);
+
 	if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
 		size = min;
 
-	if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
+	if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
 		return (ENOMEM);
 
 	dstate->dtds_size = size;
@@ -12934,10 +14032,13 @@
 	    ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
 	limit = (uintptr_t)base + size;
 
+	VERIFY((uintptr_t)start < limit);
+	VERIFY((uintptr_t)start >= (uintptr_t)base);
+
 	maxper = (limit - (uintptr_t)start) / NCPU;
 	maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
 
-#if !defined(sun)
+#ifndef illumos
 	CPU_FOREACH(i) {
 #else
 	for (i = 0; i < NCPU; i++) {
@@ -12959,7 +14060,7 @@
 			start = (dtrace_dynvar_t *)limit;
 		}
 
-		ASSERT(limit <= (uintptr_t)base + size);
+		VERIFY(limit <= (uintptr_t)base + size);
 
 		for (;;) {
 			next = (dtrace_dynvar_t *)((uintptr_t)dvar +
@@ -12968,6 +14069,8 @@
 			if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
 				break;
 
+			VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
+			    (uintptr_t)dvar <= (uintptr_t)base + size);
 			dvar->dtdv_next = next;
 			dvar = next;
 		}
@@ -13017,7 +14120,7 @@
 	}
 }
 
-#if defined(sun)
+#ifdef illumos
 static void
 dtrace_state_clean(dtrace_state_t *state)
 {
@@ -13054,7 +14157,7 @@
 	dtrace_membar_producer();
 	state->dts_alive = now;
 }
-#else
+#else	/* !illumos */
 static void
 dtrace_state_clean(void *arg)
 {
@@ -13103,16 +14206,16 @@
 	callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
 	    dtrace_state_deadman, state);
 }
-#endif
+#endif	/* illumos */
 
 static dtrace_state_t *
-#if defined(sun)
+#ifdef illumos
 dtrace_state_create(dev_t *devp, cred_t *cr)
 #else
 dtrace_state_create(struct cdev *dev)
 #endif
 {
-#if defined(sun)
+#ifdef illumos
 	minor_t minor;
 	major_t major;
 #else
@@ -13127,7 +14230,7 @@
 	ASSERT(MUTEX_HELD(&dtrace_lock));
 	ASSERT(MUTEX_HELD(&cpu_lock));
 
-#if defined(sun)
+#ifdef illumos
 	minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
 	    VM_BESTFIT | VM_SLEEP);
 
@@ -13141,7 +14244,7 @@
 	if (dev != NULL) {
 		cr = dev->si_cred;
 		m = dev2unit(dev);
-		}
+	}
 
 	/* Allocate memory for the state. */
 	state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
@@ -13150,7 +14253,7 @@
 	state->dts_epid = DTRACE_EPIDNONE + 1;
 
 	(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
-#if defined(sun)
+#ifdef illumos
 	state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
 	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
 
@@ -13178,12 +14281,12 @@
 	state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
 	state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
 
-#if defined(sun)
+#ifdef illumos
 	state->dts_cleaner = CYCLIC_NONE;
 	state->dts_deadman = CYCLIC_NONE;
 #else
-	callout_init(&state->dts_cleaner, CALLOUT_MPSAFE);
-	callout_init(&state->dts_deadman, CALLOUT_MPSAFE);
+	callout_init(&state->dts_cleaner, 1);
+	callout_init(&state->dts_deadman, 1);
 #endif
 	state->dts_vstate.dtvs_state = state;
 
@@ -13268,7 +14371,7 @@
 			 * we can do destructive things to processes which
 			 * have altered credentials.
 			 */
-#if defined(sun)
+#ifdef illumos
 			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
 			    cr->cr_zone->zone_privset)) {
 				state->dts_cred.dcr_action |=
@@ -13310,7 +14413,7 @@
 				state->dts_cred.dcr_action |=
 				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
 
-#if defined(sun)
+#ifdef illumos
 			/*
 			 * If we have all privs in whatever zone this is,
 			 * we can do destructive things to processes which
@@ -13349,7 +14452,7 @@
 {
 	dtrace_optval_t *opt = state->dts_options, size;
 	processorid_t cpu = 0;
-	int flags = 0, rval;
+	int flags = 0, rval, factor, divisor = 1;
 
 	ASSERT(MUTEX_HELD(&dtrace_lock));
 	ASSERT(MUTEX_HELD(&cpu_lock));
@@ -13379,7 +14482,7 @@
 			flags |= DTRACEBUF_INACTIVE;
 	}
 
-	for (size = opt[which]; size >= sizeof (uint64_t); size >>= 1) {
+	for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
 		/*
 		 * The size must be 8-byte aligned.  If the size is not 8-byte
 		 * aligned, drop it down by the difference.
@@ -13397,7 +14500,7 @@
 			return (E2BIG);
 		}
 
-		rval = dtrace_buffer_alloc(buf, size, flags, cpu);
+		rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
 
 		if (rval != ENOMEM) {
 			opt[which] = size;
@@ -13406,6 +14509,9 @@
 
 		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
 			return (rval);
+
+		for (divisor = 2; divisor < factor; divisor <<= 1)
+			continue;
 	}
 
 	return (ENOMEM);
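An illustration of the resize policy above (numbers hypothetical): suppose a
4MB-per-CPU allocation fails after only 2 of the 8 buffers the system wanted
were obtained.

	/*
	 * dtrace_buffer_alloc() reports factor = 8 / 2 = 4; the divisor
	 * loop above walks through 2, then 4, and stops, so the next
	 * pass tries 1MB.  The buffer thus shrinks in proportion to how
	 * badly the allocation missed, instead of always halving.
	 */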
@@ -13466,7 +14572,7 @@
 	dtrace_optval_t *opt = state->dts_options, sz, nspec;
 	dtrace_speculation_t *spec;
 	dtrace_buffer_t *buf;
-#if defined(sun)
+#ifdef illumos
 	cyc_handler_t hdlr;
 	cyc_time_t when;
 #endif
@@ -13507,7 +14613,8 @@
 		goto out;
 	}
 
-	spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
+	spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
+	    KM_NOSLEEP | KM_NORMALPRI);
 
 	if (spec == NULL) {
 		rval = ENOMEM;
@@ -13518,7 +14625,8 @@
 	state->dts_nspeculations = (int)nspec;
 
 	for (i = 0; i < nspec; i++) {
-		if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
+		if ((buf = kmem_zalloc(bufsize,
+		    KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
 			rval = ENOMEM;
 			goto err;
 		}
@@ -13648,7 +14756,7 @@
 		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
 
 	state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
-#if defined(sun)
+#ifdef illumos
 	hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
 	hdlr.cyh_arg = state;
 	hdlr.cyh_level = CY_LOW_LEVEL;
@@ -13675,6 +14783,24 @@
 
 	state->dts_activity = DTRACE_ACTIVITY_WARMUP;
 
+#ifdef illumos
+	if (state->dts_getf != 0 &&
+	    !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
+		/*
+		 * We don't have kernel privs but we have at least one call
+		 * to getf(); we need to bump our zone's count, and (if
+		 * this is the first enabling to have an unprivileged call
+		 * to getf()) we need to hook into closef().
+		 */
+		state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;
+
+		if (dtrace_getf++ == 0) {
+			ASSERT(dtrace_closef == NULL);
+			dtrace_closef = dtrace_getf_barrier;
+		}
+	}
+#endif
+
 	/*
 	 * Now it's time to actually fire the BEGIN probe.  We need to disable
 	 * interrupts here both to record the CPU on which we fired the BEGIN
@@ -13791,6 +14917,26 @@
 	state->dts_activity = DTRACE_ACTIVITY_STOPPED;
 	dtrace_sync();
 
+#ifdef illumos
+	if (state->dts_getf != 0 &&
+	    !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
+		/*
+		 * We don't have kernel privs but we have at least one call
+		 * to getf(); we need to lower our zone's count, and (if
+		 * this is the last enabling to have an unprivileged call
+		 * to getf()) we need to clear the closef() hook.
+		 */
+		ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
+		ASSERT(dtrace_closef == dtrace_getf_barrier);
+		ASSERT(dtrace_getf > 0);
+
+		state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
+
+		if (--dtrace_getf == 0)
+			dtrace_closef = NULL;
+	}
+#endif
+
 	return (0);
 }
 
@@ -13854,7 +15000,7 @@
 {
 	dtrace_ecb_t *ecb;
 	dtrace_vstate_t *vstate = &state->dts_vstate;
-#if defined(sun)
+#ifdef illumos
 	minor_t minor = getminor(state->dts_dev);
 #endif
 	int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
@@ -13932,7 +15078,7 @@
 	for (i = 0; i < nspec; i++)
 		dtrace_buffer_free(spec[i].dtsp_buffer);
 
-#if defined(sun)
+#ifdef illumos
 	if (state->dts_cleaner != CYCLIC_NONE)
 		cyclic_remove(state->dts_cleaner);
 
@@ -13972,7 +15118,7 @@
 	dtrace_format_destroy(state);
 
 	if (state->dts_aggid_arena != NULL) {
-#if defined(sun)
+#ifdef illumos
 		vmem_destroy(state->dts_aggid_arena);
 #else
 		delete_unrhdr(state->dts_aggid_arena);
@@ -13979,7 +15125,7 @@
 #endif
 		state->dts_aggid_arena = NULL;
 	}
-#if defined(sun)
+#ifdef illumos
 	ddi_soft_state_free(dtrace_softstate, minor);
 	vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
 #endif
@@ -14031,7 +15177,7 @@
 			break;
 		}
 
-#if defined(sun)
+#ifdef illumos
 		/*
 		 * We want to create anonymous state, so we need to transition
 		 * the kernel debugger to indicate that DTrace is active.  If
@@ -14050,7 +15196,7 @@
 		 * If we haven't allocated an anonymous state, we'll do so now.
 		 */
 		if ((state = dtrace_anon.dta_state) == NULL) {
-#if defined(sun)
+#ifdef illumos
 			state = dtrace_state_create(NULL, NULL);
 #else
 			state = dtrace_state_create(NULL);
@@ -14123,10 +15269,10 @@
     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
 {
 	uint32_t size, next, nnext, i;
-	dtrace_helptrace_t *ent;
+	dtrace_helptrace_t *ent, *buffer;
 	uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;
 
-	if (!dtrace_helptrace_enabled)
+	if ((buffer = dtrace_helptrace_buffer) == NULL)
 		return;
 
 	ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
@@ -14154,10 +15300,12 @@
 	/*
 	 * We have our slot; fill it in.
 	 */
-	if (nnext == size)
+	if (nnext == size) {
+		dtrace_helptrace_wrapped++;
 		next = 0;
+	}
 
-	ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
+	ent = (dtrace_helptrace_t *)((uintptr_t)buffer + next);
 	ent->dtht_helper = helper;
 	ent->dtht_where = where;
 	ent->dtht_nlocals = vstate->dtvs_nlocals;
@@ -14191,7 +15339,7 @@
 	dtrace_helper_action_t *helper;
 	dtrace_vstate_t *vstate;
 	dtrace_difo_t *pred;
-	int i, trace = dtrace_helptrace_enabled;
+	int i, trace = dtrace_helptrace_buffer != NULL;
 
 	ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
 
@@ -14552,8 +15700,8 @@
 	 * Check to make sure this isn't a duplicate.
 	 */
 	for (i = 0; i < help->dthps_nprovs; i++) {
-		if (dofhp->dofhp_addr ==
-		    help->dthps_provs[i]->dthp_prov.dofhp_addr)
+		if (dofhp->dofhp_dof ==
+		    help->dthps_provs[i]->dthp_prov.dofhp_dof)
 			return (EALREADY);
 	}
 
@@ -14713,7 +15861,13 @@
 
 		if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
 			dtrace_dof_error(dof, "function name too long");
-			return (-1);
+			/*
+			 * Keep going if the function name is too long.
+			 * Unlike provider and probe names, we cannot reasonably
+			 * impose restrictions on function names, since they're
+			 * a property of the code being instrumented. We will
+			 * skip this probe in dtrace_helper_provide_one().
+			 */
 		}
 
 		if (probe->dofpr_name >= str_sec->dofs_size ||
@@ -14939,7 +16093,7 @@
 	return (help);
 }
 
-#if defined(sun)
+#ifdef illumos
 static
 #endif
 void
@@ -14947,7 +16101,7 @@
 {
 	dtrace_helpers_t *help;
 	dtrace_vstate_t *vstate;
-#if defined(sun)
+#ifdef illumos
 	proc_t *p = curproc;
 #endif
 	int i;
@@ -15036,7 +16190,7 @@
 	mutex_exit(&dtrace_lock);
 }
 
-#if defined(sun)
+#ifdef illumos
 static
 #endif
 void
@@ -15121,7 +16275,6 @@
 		dtrace_helper_provider_register(to, newhelp, NULL);
 }
 
-#if defined(sun)
 /*
  * DTrace Hook Functions
  */
@@ -15131,9 +16284,13 @@
 	dtrace_provider_t *prv;
 
 	mutex_enter(&dtrace_provider_lock);
+#ifdef illumos
 	mutex_enter(&mod_lock);
+#endif
 
+#ifdef illumos
 	ASSERT(ctl->mod_busy);
+#endif
 
 	/*
 	 * We're going to call each providers per-module provide operation
@@ -15142,7 +16299,9 @@
 	for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
 		prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
 
+#ifdef illumos
 	mutex_exit(&mod_lock);
+#endif
 	mutex_exit(&dtrace_provider_lock);
 
 	/*
@@ -15179,17 +16338,48 @@
 }
 
 static void
+#ifdef illumos
 dtrace_module_unloaded(modctl_t *ctl)
+#else
+dtrace_module_unloaded(modctl_t *ctl, int *error)
+#endif
 {
 	dtrace_probe_t template, *probe, *first, *next;
 	dtrace_provider_t *prov;
+#ifndef illumos
+	char modname[DTRACE_MODNAMELEN];
+	size_t len;
+#endif
 
+#ifdef illumos
 	template.dtpr_mod = ctl->mod_modname;
+#else
+	/* Handle the fact that ctl->filename may end in ".ko". */
+	strlcpy(modname, ctl->filename, sizeof(modname));
+	len = strlen(ctl->filename);
+	if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
+		modname[len - 3] = '\0';
+	template.dtpr_mod = modname;
+#endif
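For example (module name hypothetical):

	/*
	 * ctl->filename = "dtrace_test.ko"  ->  modname = "dtrace_test"
	 *
	 * Probe module names never carry the ".ko" suffix, so the probe
	 * lookup template must match what providers registered.
	 */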
 
 	mutex_enter(&dtrace_provider_lock);
+#ifdef illumos
 	mutex_enter(&mod_lock);
+#endif
 	mutex_enter(&dtrace_lock);
 
+#ifndef illumos
+	if (ctl->nenabled > 0) {
+		/* Don't allow unloads if a probe is enabled. */
+		mutex_exit(&dtrace_provider_lock);
+		mutex_exit(&dtrace_lock);
+		*error = -1;
+		printf(
+	"kldunload: attempt to unload module that has DTrace probes enabled\n");
+		return;
+	}
+#endif
+
 	if (dtrace_bymod == NULL) {
 		/*
 		 * The DTrace module is loaded (obviously) but not attached;
@@ -15196,7 +16386,9 @@
 		 * we don't have any work to do.
 		 */
 		mutex_exit(&dtrace_provider_lock);
+#ifdef illumos
 		mutex_exit(&mod_lock);
+#endif
 		mutex_exit(&dtrace_lock);
 		return;
 	}
@@ -15205,7 +16397,9 @@
 	    probe != NULL; probe = probe->dtpr_nextmod) {
 		if (probe->dtpr_ecb != NULL) {
 			mutex_exit(&dtrace_provider_lock);
+#ifdef illumos
 			mutex_exit(&mod_lock);
+#endif
 			mutex_exit(&dtrace_lock);
 
 			/*
@@ -15219,8 +16413,13 @@
 			 * probe, either.
 			 */
 			if (dtrace_err_verbose) {
+#ifdef illumos
 				cmn_err(CE_WARN, "unloaded module '%s' had "
 				    "enabled probes", ctl->mod_modname);
+#else
+				cmn_err(CE_WARN, "unloaded module '%s' had "
+				    "enabled probes", modname);
+#endif
 			}
 
 			return;
@@ -15263,16 +16462,42 @@
 		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
 		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
 		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
+#ifdef illumos
 		vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
+#else
+		free_unr(dtrace_arena, probe->dtpr_id);
+#endif
 		kmem_free(probe, sizeof (dtrace_probe_t));
 	}
 
 	mutex_exit(&dtrace_lock);
+#ifdef illumos
 	mutex_exit(&mod_lock);
+#endif
 	mutex_exit(&dtrace_provider_lock);
 }
 
+#ifndef illumos
 static void
+dtrace_kld_load(void *arg __unused, linker_file_t lf)
+{
+
+	dtrace_module_loaded(lf);
+}
+
+static void
+dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
+{
+
+	if (*error != 0)
+		/* We already have an error, so don't do anything. */
+		return;
+	dtrace_module_unloaded(lf, error);
+}
+#endif
+
+#ifdef illumos
+static void
 dtrace_suspend(void)
 {
 	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
@@ -15344,7 +16569,7 @@
 	return (0);
 }
 
-#if defined(sun)
+#ifdef illumos
 static void
 dtrace_cpu_setup_initial(processorid_t cpu)
 {
@@ -15389,10 +16614,29 @@
 	dtrace_toxranges++;
 }
 
+static void
+dtrace_getf_barrier()
+{
+#ifdef illumos
+	/*
+	 * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
+	 * that contain calls to getf(), this routine will be called on every
+	 * closef() before either the underlying vnode is released or the
+	 * file_t itself is freed.  By the time we are here, it is essential
+	 * that the file_t can no longer be accessed from a call to getf()
+	 * in probe context -- that assures that a dtrace_sync() can be used
+	 * to clear out any enablings referring to the old structures.
+	 */
+	if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
+	    kcred->cr_zone->zone_dtrace_getf != 0)
+		dtrace_sync();
+#endif
+}
+
 /*
  * DTrace Driver Cookbook Functions
  */
-#if defined(sun)
+#ifdef illumos
 /*ARGSUSED*/
 static int
 dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
@@ -15504,17 +16748,6 @@
 	mutex_exit(&cpu_lock);
 
 	/*
-	 * If DTrace helper tracing is enabled, we need to allocate the
-	 * trace buffer and initialize the values.
-	 */
-	if (dtrace_helptrace_enabled) {
-		ASSERT(dtrace_helptrace_buffer == NULL);
-		dtrace_helptrace_buffer =
-		    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
-		dtrace_helptrace_next = 0;
-	}
-
-	/*
 	 * If there are already providers, we must ask them to provide their
 	 * probes, and then match any anonymous enabling against them.  Note
 	 * that there should be no other retained enablings at this time:
@@ -15559,20 +16792,15 @@
 
 	return (DDI_SUCCESS);
 }
-#endif
+#endif	/* illumos */
 
-#if !defined(sun)
-#if __FreeBSD_version >= 800039
-static void
-dtrace_dtr(void *data __unused)
-{
-}
+#ifndef illumos
+static void dtrace_dtr(void *);
 #endif
-#endif
 
 /*ARGSUSED*/
 static int
-#if defined(sun)
+#ifdef illumos
 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
 #else
 dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
@@ -15583,7 +16811,7 @@
 	uid_t uid;
 	zoneid_t zoneid;
 
-#if defined(sun)
+#ifdef illumos
 	if (getminor(*devp) == DTRACEMNRN_HELPER)
 		return (0);
 
@@ -15591,30 +16819,11 @@
 	 * If this wasn't an open with the "helper" minor, then it must be
 	 * the "dtrace" minor.
 	 */
-	ASSERT(getminor(*devp) == DTRACEMNRN_DTRACE);
+	if (getminor(*devp) == DTRACEMNRN_DTRACE)
+		return (ENXIO);
 #else
 	cred_t *cred_p = NULL;
-
-#if __FreeBSD_version < 800039
-	/*
-	 * The first minor device is the one that is cloned so there is
-	 * nothing more to do here.
-	 */
-	if (dev2unit(dev) == 0)
-		return 0;
-
-	/*
-	 * Devices are cloned, so if the DTrace state has already
-	 * been allocated, that means this device belongs to a
-	 * different client. Each client should open '/dev/dtrace'
-	 * to get a cloned device.
-	 */
-	if (dev->si_drv1 != NULL)
-		return (EBUSY);
-#endif
-
 	cred_p = dev->si_cred;
-#endif
 
 	/*
 	 * If no DTRACE_PRIV_* bits are set in the credential, then the
@@ -15622,12 +16831,7 @@
 	 */
 	dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
 	if (priv == DTRACE_PRIV_NONE) {
-#if !defined(sun)
-#if __FreeBSD_version < 800039
-		/* Destroy the cloned device. */
-                destroy_dev(dev);
 #endif
-#endif
 
 		return (EACCES);
 	}
@@ -15644,7 +16848,7 @@
 	dtrace_opens++;
 	dtrace_membar_producer();
 
-#if defined(sun)
+#ifdef illumos
 	/*
 	 * If the kernel debugger is active (that is, if the kernel debugger
 	 * modified text in some way), we won't allow the open.
@@ -15656,36 +16860,34 @@
 		return (EBUSY);
 	}
 
+	if (dtrace_helptrace_enable && dtrace_helptrace_buffer == NULL) {
+		/*
+		 * If DTrace helper tracing is enabled, we need to allocate the
+		 * trace buffer and initialize the values.
+		 */
+		dtrace_helptrace_buffer =
+		    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
+		dtrace_helptrace_next = 0;
+		dtrace_helptrace_wrapped = 0;
+		dtrace_helptrace_enable = 0;
+	}
+
 	state = dtrace_state_create(devp, cred_p);
 #else
 	state = dtrace_state_create(dev);
-#if __FreeBSD_version < 800039
-	dev->si_drv1 = state;
-#else
 	devfs_set_cdevpriv(state, dtrace_dtr);
 #endif
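On FreeBSD, devfs_set_cdevpriv() ties the newly created state to the calling file descriptor and registers dtrace_dtr() to run when that descriptor is destroyed; this per-open private data replaces the removed scheme of cloning a separate device node per consumer (the deleted __FreeBSD_version < 800039 code above).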
-	/* This code actually belongs in dtrace_attach() */
-	if (dtrace_opens == 1)
-		dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
-		    1, INT_MAX, 0);
-#endif
 
 	mutex_exit(&cpu_lock);
 
 	if (state == NULL) {
-#if defined(sun)
-		if (--dtrace_opens == 0)
+#ifdef illumos
+		if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
 			(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
 #else
 		--dtrace_opens;
 #endif
 		mutex_exit(&dtrace_lock);
-#if !defined(sun)
-#if __FreeBSD_version < 800039
-		/* Destroy the cloned device. */
-                destroy_dev(dev);
-#endif
-#endif
 		return (EAGAIN);
 	}
 
@@ -15695,82 +16897,92 @@
 }
 
 /*ARGSUSED*/
+#ifdef illumos
 static int
-#if defined(sun)
 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
 #else
-dtrace_close(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
+static void
+dtrace_dtr(void *data)
 #endif
 {
-#if defined(sun)
+#ifdef illumos
 	minor_t minor = getminor(dev);
 	dtrace_state_t *state;
+#endif
+	dtrace_helptrace_t *buf = NULL;
 
+#ifdef illumos
 	if (minor == DTRACEMNRN_HELPER)
 		return (0);
 
 	state = ddi_get_soft_state(dtrace_softstate, minor);
 #else
-#if __FreeBSD_version < 800039
-	dtrace_state_t *state = dev->si_drv1;
+	dtrace_state_t *state = data;
+#endif
 
-	/* Check if this is not a cloned device. */
-	if (dev2unit(dev) == 0)
-		return (0);
+	mutex_enter(&cpu_lock);
+	mutex_enter(&dtrace_lock);
+
+#ifdef illumos
+	if (state->dts_anon)
 #else
-	dtrace_state_t *state;
-	devfs_get_cdevpriv((void **) &state);
+	if (state != NULL && state->dts_anon)
 #endif
+	{
+		/*
+		 * There is anonymous state. Destroy that first.
+		 */
+		ASSERT(dtrace_anon.dta_state == NULL);
+		dtrace_state_destroy(state->dts_anon);
+	}
 
-#endif
+	if (dtrace_helptrace_disable) {
+		/*
+		 * If we have been told to disable helper tracing, set the
+		 * buffer to NULL before calling into dtrace_state_destroy();
+		 * we take advantage of its dtrace_sync() to know that no
+		 * CPU is in probe context with enabled helper tracing
+		 * after it returns.
+		 */
+		buf = dtrace_helptrace_buffer;
+		dtrace_helptrace_buffer = NULL;
+	}
 
-	mutex_enter(&cpu_lock);
-	mutex_enter(&dtrace_lock);
-
+#ifdef illumos
+	dtrace_state_destroy(state);
+#else
 	if (state != NULL) {
-		if (state->dts_anon) {
-			/*
-			 * There is anonymous state. Destroy that first.
-			 */
-			ASSERT(dtrace_anon.dta_state == NULL);
-			dtrace_state_destroy(state->dts_anon);
-		}
-
 		dtrace_state_destroy(state);
-
-#if !defined(sun)
 		kmem_free(state, 0);
-#if __FreeBSD_version < 800039
-		dev->si_drv1 = NULL;
+	}
 #endif
-#endif
-	}
+	ASSERT(dtrace_opens > 0);
 
-	ASSERT(dtrace_opens > 0);
-#if defined(sun)
-	if (--dtrace_opens == 0)
+#ifdef illumos
+	/*
+	 * Only relinquish control of the kernel debugger interface when there
+	 * are no consumers and no anonymous enablings.
+	 */
+	if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
 		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
 #else
 	--dtrace_opens;
-	/* This code actually belongs in dtrace_detach() */
-	if ((dtrace_opens == 0) && (dtrace_taskq != NULL)) {
-		taskq_destroy(dtrace_taskq);
-		dtrace_taskq = NULL;
-	}
 #endif
 
+	if (buf != NULL) {
+		kmem_free(buf, dtrace_helptrace_bufsize);
+		dtrace_helptrace_disable = 0;
+	}
+
 	mutex_exit(&dtrace_lock);
 	mutex_exit(&cpu_lock);
 
-#if __FreeBSD_version < 800039
-	/* Schedule this cloned device to be destroyed. */
-	destroy_dev_sched(dev);
+#ifdef illumos
+	return (0);
 #endif
-
-	return (0);
 }
 
-#if defined(sun)
+#ifdef illumos
 /*ARGSUSED*/
 static int
 dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
@@ -16385,6 +17597,7 @@
 			desc.dtbd_drops = buf->dtb_drops;
 			desc.dtbd_errors = buf->dtb_errors;
 			desc.dtbd_oldest = buf->dtb_xamot_offset;
+			desc.dtbd_timestamp = dtrace_gethrtime();
 
 			mutex_exit(&dtrace_lock);
 
@@ -16437,6 +17650,7 @@
 		desc.dtbd_drops = buf->dtb_xamot_drops;
 		desc.dtbd_errors = buf->dtb_xamot_errors;
 		desc.dtbd_oldest = 0;
+		desc.dtbd_timestamp = buf->dtb_switched;
 
 		mutex_exit(&dtrace_lock);
 
@@ -16652,13 +17866,11 @@
 	dtrace_modload = NULL;
 	dtrace_modunload = NULL;
 
+	ASSERT(dtrace_getf == 0);
+	ASSERT(dtrace_closef == NULL);
+
 	mutex_exit(&cpu_lock);
 
-	if (dtrace_helptrace_enabled) {
-		kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
-		dtrace_helptrace_buffer = NULL;
-	}
-
 	kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
 	dtrace_probes = NULL;
 	dtrace_nprobes = 0;
@@ -16709,7 +17921,7 @@
 }
 #endif
 
-#if defined(sun)
+#ifdef illumos
 /*ARGSUSED*/
 static int
 dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
@@ -16732,7 +17944,7 @@
 }
 #endif
 
-#if defined(sun)
+#ifdef illumos
 static struct cb_ops dtrace_cb_ops = {
 	dtrace_open,		/* open */
 	dtrace_close,		/* close */
@@ -16800,14 +18012,8 @@
 static d_ioctl_t	dtrace_ioctl_helper;
 static void		dtrace_load(void *);
 static int		dtrace_unload(void);
-#if __FreeBSD_version < 800039
-static void		dtrace_clone(void *, struct ucred *, char *, int , struct cdev **);
-static struct clonedevs	*dtrace_clones;		/* Ptr to the array of cloned devices. */
-static eventhandler_tag	eh_tag;			/* Event handler tag. */
-#else
 static struct cdev	*dtrace_dev;
 static struct cdev	*helper_dev;
-#endif
 
 void dtrace_invop_init(void);
 void dtrace_invop_uninit(void);
@@ -16814,8 +18020,6 @@
 
 static struct cdevsw dtrace_cdevsw = {
 	.d_version	= D_VERSION,
-	.d_flags	= D_TRACKCLOSE | D_NEEDMINOR,
-	.d_close	= dtrace_close,
 	.d_ioctl	= dtrace_ioctl,
 	.d_open		= dtrace_open,
 	.d_name		= "dtrace",
@@ -16823,15 +18027,11 @@
 
 static struct cdevsw helper_cdevsw = {
 	.d_version	= D_VERSION,
-	.d_flags	= D_TRACKCLOSE | D_NEEDMINOR,
 	.d_ioctl	= dtrace_ioctl_helper,
 	.d_name		= "helper",
 };
 
 #include <dtrace_anon.c>
-#if __FreeBSD_version < 800039
-#include <dtrace_clone.c>
-#endif
 #include <dtrace_ioctl.c>
 #include <dtrace_load.c>
 #include <dtrace_modevent.c>
@@ -16847,6 +18047,5 @@
 
 DEV_MODULE(dtrace, dtrace_modevent, NULL);
 MODULE_VERSION(dtrace, 1);
-MODULE_DEPEND(dtrace, cyclic, 1, 1, 1);
 MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
 #endif

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  *
  * Portions Copyright 2010 The FreeBSD Foundation
  *
- * $FreeBSD: release/9.2.0/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c 253394 2013-07-16 15:51:32Z avg $
+ * $FreeBSD: stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c 299003 2016-05-03 20:08:05Z markj $
  */
 
 /*
@@ -28,9 +29,9 @@
  * Use is subject to license terms.
  */
 
-#if defined(sun)
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-#endif
+/*
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ */
 
 #include <sys/atomic.h>
 #include <sys/errno.h>
@@ -38,13 +39,13 @@
 #include <sys/modctl.h>
 #include <sys/conf.h>
 #include <sys/systm.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/ddi.h>
 #endif
 #include <sys/sunddi.h>
 #include <sys/cpuvar.h>
 #include <sys/kmem.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/strsubr.h>
 #endif
 #include <sys/fasttrap.h>
@@ -55,14 +56,24 @@
 #include <sys/sysmacros.h>
 #include <sys/proc.h>
 #include <sys/policy.h>
-#if defined(sun)
+#ifdef illumos
 #include <util/qsort.h>
 #endif
 #include <sys/mutex.h>
 #include <sys/kernel.h>
-#if !defined(sun)
+#ifndef illumos
+#include <sys/dtrace_bsd.h>
+#include <sys/eventhandler.h>
+#include <sys/rmlock.h>
+#include <sys/sysctl.h>
+#include <sys/u8_textprep.h>
 #include <sys/user.h>
-#include <sys/dtrace_bsd.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_param.h>
+
 #include <cddl/dev/dtrace/dtrace_cddl.h>
 #endif
 
@@ -155,9 +166,9 @@
 static struct cdev *fasttrap_cdev;
 static dtrace_meta_provider_id_t fasttrap_meta_id;
 
-static struct callout fasttrap_timeout;
+static struct proc *fasttrap_cleanup_proc;
 static struct mtx fasttrap_cleanup_mtx;
-static uint_t fasttrap_cleanup_work;
+static uint_t fasttrap_cleanup_work, fasttrap_cleanup_drain, fasttrap_cleanup_cv;
 
 /*
  * Generation count on modifications to the global tracepoint lookup table.
@@ -166,13 +177,14 @@
 
 /*
  * When the fasttrap provider is loaded, fasttrap_max is set to either
- * FASTTRAP_MAX_DEFAULT or the value for fasttrap-max-probes in the
- * fasttrap.conf file. Each time a probe is created, fasttrap_total is
- * incremented by the number of tracepoints that may be associated with that
- * probe; fasttrap_total is capped at fasttrap_max.
+ * FASTTRAP_MAX_DEFAULT, or the value for fasttrap-max-probes in the
+ * fasttrap.conf file (illumos), or the value provided in loader.conf (FreeBSD).
+ * Each time a probe is created, fasttrap_total is incremented by the number
+ * of tracepoints that may be associated with that probe; fasttrap_total is
+ * capped at fasttrap_max.
  */
 #define	FASTTRAP_MAX_DEFAULT		250000
-static uint32_t fasttrap_max;
+static uint32_t fasttrap_max = FASTTRAP_MAX_DEFAULT;
 static uint32_t fasttrap_total;
 
 /*
@@ -206,15 +218,31 @@
 static fasttrap_proc_t *fasttrap_proc_lookup(pid_t);
 static void fasttrap_proc_release(fasttrap_proc_t *);
 
+#ifndef illumos
+static void fasttrap_thread_dtor(void *, struct thread *);
+#endif
+
 #define	FASTTRAP_PROVS_INDEX(pid, name) \
 	((fasttrap_hash_str(name) + (pid)) & fasttrap_provs.fth_mask)
 
 #define	FASTTRAP_PROCS_INDEX(pid) ((pid) & fasttrap_procs.fth_mask)
 
-#if !defined(sun)
-static kmutex_t fasttrap_cpuc_pid_lock[MAXCPU];
+#ifndef illumos
+struct rmlock fasttrap_tp_lock;
+static eventhandler_tag fasttrap_thread_dtor_tag;
 #endif
 
+static unsigned long tpoints_hash_size = FASTTRAP_TPOINTS_DEFAULT_SIZE;
+
+#ifdef __FreeBSD__
+SYSCTL_DECL(_kern_dtrace);
+SYSCTL_NODE(_kern_dtrace, OID_AUTO, fasttrap, CTLFLAG_RD, 0, "DTrace fasttrap parameters");
+SYSCTL_UINT(_kern_dtrace_fasttrap, OID_AUTO, max_probes, CTLFLAG_RWTUN, &fasttrap_max,
+    FASTTRAP_MAX_DEFAULT, "Maximum number of fasttrap probes");
+SYSCTL_ULONG(_kern_dtrace_fasttrap, OID_AUTO, tpoints_hash_size, CTLFLAG_RDTUN, &tpoints_hash_size,
+    FASTTRAP_TPOINTS_DEFAULT_SIZE, "Size of the tracepoint hash table");
+#endif
+
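With these definitions, the probe budget and tracepoint hash size become tunable from userland. A usage sketch (the values shown are arbitrary examples, not recommendations):

	# /boot/loader.conf -- both knobs are read when the module loads
	kern.dtrace.fasttrap.max_probes="500000"
	kern.dtrace.fasttrap.tpoints_hash_size="65536"

	# max_probes is CTLFLAG_RWTUN, so it can also be changed at runtime:
	sysctl kern.dtrace.fasttrap.max_probes=500000
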
 static int
 fasttrap_highbit(ulong_t i)
 {
@@ -263,7 +291,7 @@
 void
 fasttrap_sigtrap(proc_t *p, kthread_t *t, uintptr_t pc)
 {
-#if defined(sun)
+#ifdef illumos
 	sigqueue_t *sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
 
 	sqp->sq_info.si_signo = SIGTRAP;
@@ -284,12 +312,124 @@
 	ksi->ksi_code = TRAP_DTRACE;
 	ksi->ksi_addr = (caddr_t)pc;
 	PROC_LOCK(p);
-	(void) tdksignal(t, SIGTRAP, ksi);
+	(void) tdsendsignal(p, t, SIGTRAP, ksi);
 	PROC_UNLOCK(p);
 #endif
 }
 
+#ifndef illumos
 /*
+ * Obtain a chunk of scratch space in the address space of the target process.
+ */
+fasttrap_scrspace_t *
+fasttrap_scraddr(struct thread *td, fasttrap_proc_t *fprc)
+{
+	fasttrap_scrblock_t *scrblk;
+	fasttrap_scrspace_t *scrspc;
+	struct proc *p;
+	vm_offset_t addr;
+	int error, i;
+
+	scrspc = NULL;
+	if (td->t_dtrace_sscr != NULL) {
+		/* If the thread already has scratch space, we're done. */
+		scrspc = (fasttrap_scrspace_t *)td->t_dtrace_sscr;
+		return (scrspc);
+	}
+
+	p = td->td_proc;
+
+	mutex_enter(&fprc->ftpc_mtx);
+	if (LIST_EMPTY(&fprc->ftpc_fscr)) {
+		/*
+		 * No scratch space is available, so we'll map a new scratch
+		 * space block into the traced process' address space.
+		 */
+		addr = 0;
+		error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr,
+		    FASTTRAP_SCRBLOCK_SIZE, 0, VMFS_ANY_SPACE, VM_PROT_ALL,
+		    VM_PROT_ALL, 0);
+		if (error != KERN_SUCCESS)
+			goto done;
+
+		scrblk = malloc(sizeof(*scrblk), M_SOLARIS, M_WAITOK);
+		scrblk->ftsb_addr = addr;
+		LIST_INSERT_HEAD(&fprc->ftpc_scrblks, scrblk, ftsb_next);
+
+		/*
+		 * Carve the block up into chunks and put them on the free list.
+		 */
+		for (i = 0;
+		    i < FASTTRAP_SCRBLOCK_SIZE / FASTTRAP_SCRSPACE_SIZE; i++) {
+			scrspc = malloc(sizeof(*scrspc), M_SOLARIS, M_WAITOK);
+			scrspc->ftss_addr = addr +
+			    i * FASTTRAP_SCRSPACE_SIZE;
+			LIST_INSERT_HEAD(&fprc->ftpc_fscr, scrspc,
+			    ftss_next);
+		}
+	}
+
+	/*
+	 * Take the first scratch chunk off the free list, put it on the
+	 * allocated list, and return its address.
+	 */
+	scrspc = LIST_FIRST(&fprc->ftpc_fscr);
+	LIST_REMOVE(scrspc, ftss_next);
+	LIST_INSERT_HEAD(&fprc->ftpc_ascr, scrspc, ftss_next);
+
+	/*
+	 * This scratch space is reserved for use by td until the thread exits.
+	 */
+	td->t_dtrace_sscr = scrspc;
+
+done:
+	mutex_exit(&fprc->ftpc_mtx);
+
+	return (scrspc);
+}
+
+/*
+ * Return any allocated per-thread scratch space chunks back to the process'
+ * free list.
+ */
+static void
+fasttrap_thread_dtor(void *arg __unused, struct thread *td)
+{
+	fasttrap_bucket_t *bucket;
+	fasttrap_proc_t *fprc;
+	fasttrap_scrspace_t *scrspc;
+	pid_t pid;
+
+	if (td->t_dtrace_sscr == NULL)
+		return;
+
+	pid = td->td_proc->p_pid;
+	bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)];
+	fprc = NULL;
+
+	/* Look up the fasttrap process handle for this process. */
+	mutex_enter(&bucket->ftb_mtx);
+	for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) {
+		if (fprc->ftpc_pid == pid) {
+			mutex_enter(&fprc->ftpc_mtx);
+			mutex_exit(&bucket->ftb_mtx);
+			break;
+		}
+	}
+	if (fprc == NULL) {
+		mutex_exit(&bucket->ftb_mtx);
+		return;
+	}
+
+	scrspc = (fasttrap_scrspace_t *)td->t_dtrace_sscr;
+	LIST_REMOVE(scrspc, ftss_next);
+	LIST_INSERT_HEAD(&fprc->ftpc_fscr, scrspc, ftss_next);
+
+	mutex_exit(&fprc->ftpc_mtx);
+}
+#endif
+
+/*
  * This function ensures that no threads are actively using the memory
  * associated with probes that were formerly live.
  */
@@ -303,15 +443,23 @@
 
 	fasttrap_mod_gen++;
 
+#ifdef illumos
 	CPU_FOREACH(i) {
 		mutex_enter(&fasttrap_cpuc_pid_lock[i]);
 		mutex_exit(&fasttrap_cpuc_pid_lock[i]);
 	}
+#else
+	rm_wlock(&fasttrap_tp_lock);
+	rm_wunlock(&fasttrap_tp_lock);
+#endif
 }
 
 /*
- * This is the timeout's callback for cleaning up the providers and their
- * probes.
+ * This function performs asynchronous cleanup of fasttrap providers. The
+ * Solaris implementation of this mechanism uses a timeout that's activated in
+ * fasttrap_pid_cleanup(), but this doesn't work in FreeBSD: one may sleep while
+ * holding the DTrace mutexes, but it is unsafe to sleep in a callout handler.
+ * Thus we use a dedicated process to perform the cleanup when requested.
  */
 /*ARGSUSED*/
 static void
@@ -322,11 +470,8 @@
 	dtrace_provider_id_t provid;
 	int i, later = 0, rval;
 
-	static volatile int in = 0;
-	ASSERT(in == 0);
-	in = 1;
-
-	while (fasttrap_cleanup_work) {
+	mtx_lock(&fasttrap_cleanup_mtx);
+	while (!fasttrap_cleanup_drain || later > 0) {
 		fasttrap_cleanup_work = 0;
 		mtx_unlock(&fasttrap_cleanup_mtx);
 
@@ -397,39 +542,32 @@
 			}
 			mutex_exit(&bucket->ftb_mtx);
 		}
+		mtx_lock(&fasttrap_cleanup_mtx);
 
-		mtx_lock(&fasttrap_cleanup_mtx);
+		/*
+		 * If we were unable to retire a provider, try again after a
+		 * second. This situation can occur in certain circumstances
+		 * where providers cannot be unregistered even though they have
+		 * no probes enabled because of an execution of dtrace -l or
+		 * something similar.
+		 */
+		if (later > 0 || fasttrap_cleanup_work ||
+		    fasttrap_cleanup_drain) {
+			mtx_unlock(&fasttrap_cleanup_mtx);
+			pause("ftclean", hz);
+			mtx_lock(&fasttrap_cleanup_mtx);
+		} else
+			mtx_sleep(&fasttrap_cleanup_cv, &fasttrap_cleanup_mtx,
+			    0, "ftcl", 0);
 	}
 
-#if 0
-	ASSERT(fasttrap_timeout != 0);
-#endif
-
 	/*
-	 * If we were unable to remove a retired provider, try again after
-	 * a second. This situation can occur in certain circumstances where
-	 * providers cannot be unregistered even though they have no probes
-	 * enabled because of an execution of dtrace -l or something similar.
-	 * If the timeout has been disabled (set to 1 because we're trying
-	 * to detach), we set fasttrap_cleanup_work to ensure that we'll
-	 * get a chance to do that work if and when the timeout is reenabled
-	 * (if detach fails).
+	 * Wake up the thread in fasttrap_unload() now that we're done.
 	 */
-	if (later > 0) {
-		if (callout_active(&fasttrap_timeout)) {
-			callout_reset(&fasttrap_timeout, hz,
-			    &fasttrap_pid_cleanup_cb, NULL);
-		}
- 
-	else if (later > 0)
-		fasttrap_cleanup_work = 1;
-	} else {
-#if !defined(sun)
-		/* Nothing to be done for FreeBSD */
-#endif
-	}
+	wakeup(&fasttrap_cleanup_drain);
+	mtx_unlock(&fasttrap_cleanup_mtx);
 
-	in = 0;
+	kthread_exit();
 }
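Because the hunks above interleave old and new code, the shape of the new cleanup process is easier to see in isolation. A self-contained sketch of the pattern, using assumed (illustrative) names rather than the driver's own:

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/kthread.h>
	#include <sys/lock.h>
	#include <sys/mutex.h>
	#include <sys/proc.h>

	static struct mtx work_mtx;
	static u_int work_pending, draining;

	static void
	cleanup_proc(void *arg __unused)
	{
		mtx_lock(&work_mtx);
		while (!draining) {
			if (!work_pending) {
				/* Sleep until a producer calls wakeup(). */
				mtx_sleep(&work_pending, &work_mtx, 0,
				    "clean", 0);
				continue;
			}
			work_pending = 0;
			mtx_unlock(&work_mtx);
			/* Do the work; sleeping is safe in a kproc. */
			mtx_lock(&work_mtx);
		}
		wakeup(&draining);	/* unblock the unload path */
		mtx_unlock(&work_mtx);
		kthread_exit();
	}

The producer side is just setting the flag and calling wakeup() on the sleep channel under the mutex, which is exactly what the new fasttrap_pid_cleanup() below does with its own variables.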
 
 /*
@@ -440,8 +578,10 @@
 {
 
 	mtx_lock(&fasttrap_cleanup_mtx);
-	fasttrap_cleanup_work = 1;
-	callout_reset(&fasttrap_timeout, 1, &fasttrap_pid_cleanup_cb, NULL);
+	if (!fasttrap_cleanup_work) {
+		fasttrap_cleanup_work = 1;
+		wakeup(&fasttrap_cleanup_cv);
+	}
 	mtx_unlock(&fasttrap_cleanup_mtx);
 }
 
@@ -454,16 +594,20 @@
 static void
 fasttrap_fork(proc_t *p, proc_t *cp)
 {
+#ifndef illumos
+	fasttrap_scrblock_t *scrblk;
+	fasttrap_proc_t *fprc = NULL;
+#endif
 	pid_t ppid = p->p_pid;
 	int i;
 
-#if defined(sun)
+#ifdef illumos
 	ASSERT(curproc == p);
 	ASSERT(p->p_proc_flag & P_PR_LOCK);
 #else
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 #endif
-#if defined(sun)
+#ifdef illumos
 	ASSERT(p->p_dtrace_count > 0);
 #else
 	if (p->p_dtrace_helpers) {
@@ -502,12 +646,18 @@
 	 * We don't have to worry about the child process disappearing
 	 * because we're in fork().
 	 */
-#if defined(sun)
+#ifdef illumos
 	mtx_lock_spin(&cp->p_slock);
 	sprlock_proc(cp);
 	mtx_unlock_spin(&cp->p_slock);
 #else
+	/*
+	 * fasttrap_tracepoint_remove() expects the child process to be
+	 * unlocked and the VM then expects curproc to be unlocked.
+	 */
 	_PHOLD(cp);
+	PROC_UNLOCK(cp);
+	PROC_UNLOCK(p);
 #endif
 
 	/*
@@ -533,15 +683,36 @@
 				 * mid-fork.
 				 */
 				ASSERT(tp->ftt_proc->ftpc_acount != 0);
+#ifndef illumos
+				fprc = tp->ftt_proc;
+#endif
 			}
 		}
 		mutex_exit(&bucket->ftb_mtx);
+
+#ifndef illumos
+		/*
+		 * Unmap any scratch space inherited from the parent's address
+		 * space.
+		 */
+		if (fprc != NULL) {
+			mutex_enter(&fprc->ftpc_mtx);
+			LIST_FOREACH(scrblk, &fprc->ftpc_scrblks, ftsb_next) {
+				vm_map_remove(&cp->p_vmspace->vm_map,
+				    scrblk->ftsb_addr,
+				    scrblk->ftsb_addr + FASTTRAP_SCRBLOCK_SIZE);
+			}
+			mutex_exit(&fprc->ftpc_mtx);
+		}
+#endif
 	}
 
-#if defined(sun)
+#ifdef illumos
 	mutex_enter(&cp->p_lock);
 	sprunlock(cp);
 #else
+	PROC_LOCK(p);
+	PROC_LOCK(cp);
 	_PRELE(cp);
 #endif
 }
@@ -554,12 +725,24 @@
 static void
 fasttrap_exec_exit(proc_t *p)
 {
-#if defined(sun)
+#ifndef illumos
+	struct thread *td;
+#endif
+
+#ifdef illumos
 	ASSERT(p == curproc);
-#endif
+#else
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	_PHOLD(p);
+	/*
+	 * Since struct threads may be recycled, we cannot rely on the
+	 * t_dtrace_sscr field being zeroed by kdtrace_thread_ctor. Thus we
+	 * must zero it ourselves when a process exits.
+	 */
+	FOREACH_THREAD_IN_PROC(p, td)
+		td->t_dtrace_sscr = NULL;
 	PROC_UNLOCK(p);
+#endif
 
 	/*
 	 * We clean up the pid provider for this process here; user-land
@@ -566,12 +749,12 @@
 	 * static probes are handled by the meta-provider remove entry point.
 	 */
 	fasttrap_provider_retire(p->p_pid, FASTTRAP_PID_NAME, 0);
-#if !defined(sun)
+#ifndef illumos
 	if (p->p_dtrace_helpers)
 		dtrace_helpers_destroy(p);
-#endif
 	PROC_LOCK(p);
 	_PRELE(p);
+#endif
 }
 
 
@@ -601,7 +784,7 @@
 
 	ASSERT(probe->ftp_tps[index].fit_tp->ftt_pid == pid);
 
-#if defined(sun)
+#ifdef illumos
 	ASSERT(!(p->p_flag & SVFORK));
 #endif
 
@@ -709,7 +892,7 @@
 		 * Increment the count of the number of tracepoints active in
 		 * the victim process.
 		 */
-#if defined(sun)
+#ifdef illumos
 		ASSERT(p->p_proc_flag & P_PR_LOCK);
 #endif
 		p->p_dtrace_count++;
@@ -901,7 +1084,7 @@
 		 * Decrement the count of the number of tracepoints active
 		 * in the victim process.
 		 */
-#if defined(sun)
+#ifdef illumos
 		ASSERT(p->p_proc_flag & P_PR_LOCK);
 #endif
 		p->p_dtrace_count--;
@@ -954,7 +1137,7 @@
 static void
 fasttrap_disable_callbacks(void)
 {
-#if defined(sun)
+#ifdef illumos
 	ASSERT(MUTEX_HELD(&cpu_lock));
 #endif
 
@@ -963,7 +1146,7 @@
 	ASSERT(fasttrap_pid_count > 0);
 	fasttrap_pid_count--;
 	if (fasttrap_pid_count == 0) {
-#if defined(sun)
+#ifdef illumos
 		cpu_t *cur, *cpu = CPU;
 
 		for (cur = cpu->cpu_next_onln; cur != cpu;
@@ -973,7 +1156,7 @@
 #endif
 		dtrace_pid_probe_ptr = NULL;
 		dtrace_return_probe_ptr = NULL;
-#if defined(sun)
+#ifdef illumos
 		for (cur = cpu->cpu_next_onln; cur != cpu;
 		    cur = cur->cpu_next_onln) {
 			rw_exit(&cur->cpu_ft_lock);
@@ -991,11 +1174,10 @@
 	proc_t *p = NULL;
 	int i, rc;
 
-
 	ASSERT(probe != NULL);
 	ASSERT(!probe->ftp_enabled);
 	ASSERT(id == probe->ftp_id);
-#if defined(sun)
+#ifdef illumos
 	ASSERT(MUTEX_HELD(&cpu_lock));
 #endif
 
@@ -1022,7 +1204,7 @@
 	 * a fork in which the traced process is being born and we're copying
 	 * USDT probes. Otherwise, the process is gone so bail.
 	 */
-#if defined(sun)
+#ifdef illumos
 	if ((p = sprlock(probe->ftp_pid)) == NULL) {
 		if ((curproc->p_flag & SFORKING) == 0)
 			return;
@@ -1090,7 +1272,7 @@
 				i--;
 			}
 
-#if defined(sun)
+#ifdef illumos
 			mutex_enter(&p->p_lock);
 			sprunlock(p);
 #else
@@ -1105,7 +1287,7 @@
 			return;
 		}
 	}
-#if defined(sun)
+#ifdef illumos
 	mutex_enter(&p->p_lock);
 	sprunlock(p);
 #else
@@ -1136,8 +1318,13 @@
 	 */
 	if ((p = pfind(probe->ftp_pid)) != NULL) {
 #ifdef __FreeBSD__
-		_PHOLD(p);
-		PROC_UNLOCK(p);
+		if (p->p_flag & P_WEXIT) {
+			PROC_UNLOCK(p);
+			p = NULL;
+		} else {
+			_PHOLD(p);
+			PROC_UNLOCK(p);
+		}
 #endif
 	}
 
@@ -1184,7 +1371,7 @@
 
 	probe->ftp_enabled = 0;
 
-#if defined(sun)
+#ifdef illumos
 	ASSERT(MUTEX_HELD(&cpu_lock));
 #endif
 	fasttrap_disable_callbacks();
@@ -1307,7 +1494,7 @@
 			mutex_enter(&fprc->ftpc_mtx);
 			mutex_exit(&bucket->ftb_mtx);
 			fprc->ftpc_rcount++;
-			atomic_add_64(&fprc->ftpc_acount, 1);
+			atomic_inc_64(&fprc->ftpc_acount);
 			ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount);
 			mutex_exit(&fprc->ftpc_mtx);
 
@@ -1325,7 +1512,7 @@
 	new_fprc->ftpc_pid = pid;
 	new_fprc->ftpc_rcount = 1;
 	new_fprc->ftpc_acount = 1;
-#if !defined(sun)
+#ifndef illumos
 	mutex_init(&new_fprc->ftpc_mtx, "fasttrap proc mtx", MUTEX_DEFAULT,
 	    NULL);
 #endif
@@ -1341,7 +1528,7 @@
 			mutex_enter(&fprc->ftpc_mtx);
 			mutex_exit(&bucket->ftb_mtx);
 			fprc->ftpc_rcount++;
-			atomic_add_64(&fprc->ftpc_acount, 1);
+			atomic_inc_64(&fprc->ftpc_acount);
 			ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount);
 			mutex_exit(&fprc->ftpc_mtx);
 
@@ -1365,6 +1552,12 @@
 	fasttrap_bucket_t *bucket;
 	fasttrap_proc_t *fprc, **fprcp;
 	pid_t pid = proc->ftpc_pid;
+#ifndef illumos
+	fasttrap_scrblock_t *scrblk, *scrblktmp;
+	fasttrap_scrspace_t *scrspc, *scrspctmp;
+	struct proc *p;
+	struct thread *td;
+#endif
 
 	mutex_enter(&proc->ftpc_mtx);
 
@@ -1376,6 +1569,31 @@
 		return;
 	}
 
+#ifndef illumos
+	/*
+	 * Free all structures used to manage per-thread scratch space.
+	 */
+	LIST_FOREACH_SAFE(scrblk, &proc->ftpc_scrblks, ftsb_next,
+	    scrblktmp) {
+		LIST_REMOVE(scrblk, ftsb_next);
+		free(scrblk, M_SOLARIS);
+	}
+	LIST_FOREACH_SAFE(scrspc, &proc->ftpc_fscr, ftss_next, scrspctmp) {
+		LIST_REMOVE(scrspc, ftss_next);
+		free(scrspc, M_SOLARIS);
+	}
+	LIST_FOREACH_SAFE(scrspc, &proc->ftpc_ascr, ftss_next, scrspctmp) {
+		LIST_REMOVE(scrspc, ftss_next);
+		free(scrspc, M_SOLARIS);
+	}
+
+	if ((p = pfind(pid)) != NULL) {
+		FOREACH_THREAD_IN_PROC(p, td)
+			td->t_dtrace_sscr = NULL;
+		PROC_UNLOCK(p);
+	}
+#endif
+
 	mutex_exit(&proc->ftpc_mtx);
 
 	/*
@@ -1473,7 +1691,7 @@
 	new_fp = kmem_zalloc(sizeof (fasttrap_provider_t), KM_SLEEP);
 	new_fp->ftp_pid = pid;
 	new_fp->ftp_proc = fasttrap_proc_lookup(pid);
-#if !defined(sun)
+#ifndef illumos
 	mutex_init(&new_fp->ftp_mtx, "provider mtx", MUTEX_DEFAULT, NULL);
 	mutex_init(&new_fp->ftp_cmtx, "lock on creating", MUTEX_DEFAULT, NULL);
 #endif
@@ -1547,7 +1765,7 @@
 	 * count of active providers on the associated process structure.
 	 */
 	if (!provider->ftp_retired) {
-		atomic_add_64(&provider->ftp_proc->ftpc_acount, -1);
+		atomic_dec_64(&provider->ftp_proc->ftpc_acount);
 		ASSERT(provider->ftp_proc->ftpc_acount <
 		    provider->ftp_proc->ftpc_rcount);
 	}
@@ -1554,7 +1772,7 @@
 
 	fasttrap_proc_release(provider->ftp_proc);
 
-#if !defined(sun)
+#ifndef illumos
 	mutex_destroy(&provider->ftp_mtx);
 	mutex_destroy(&provider->ftp_cmtx);
 #endif
@@ -1572,7 +1790,7 @@
 	}
 
 	p->p_dtrace_probes--;
-#if !defined(sun)
+#ifndef illumos
 	PROC_UNLOCK(p);
 #endif
 }
@@ -1623,7 +1841,7 @@
 	 * bucket lock therefore protects the integrity of the provider hash
 	 * table.
 	 */
-	atomic_add_64(&fp->ftp_proc->ftpc_acount, -1);
+	atomic_dec_64(&fp->ftp_proc->ftpc_acount);
 	ASSERT(fp->ftp_proc->ftpc_acount < fp->ftp_proc->ftpc_rcount);
 
 	fp->ftp_retired = 1;
@@ -1719,10 +1937,10 @@
 			    pdata->ftps_mod, pdata->ftps_func, name_str) != 0)
 				continue;
 
-			atomic_add_32(&fasttrap_total, 1);
+			atomic_inc_32(&fasttrap_total);
 
 			if (fasttrap_total > fasttrap_max) {
-				atomic_add_32(&fasttrap_total, -1);
+				atomic_dec_32(&fasttrap_total);
 				goto no_mem;
 			}
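The atomic_inc_32()/atomic_dec_32() pair implements a lock-free cap: the counter is reserved optimistically, and a reservation that pushes fasttrap_total past fasttrap_max is immediately rolled back. Racing threads may transiently overshoot the cap, but every overshooting thread backs its increment out, so successful reservations never settle above fasttrap_max and no mutex is needed on this hot path.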
 
@@ -2062,20 +2280,15 @@
 		return (EAGAIN);
 
 	if (cmd == FASTTRAPIOC_MAKEPROBE) {
-		fasttrap_probe_spec_t *uprobe = (void *)arg;
+		fasttrap_probe_spec_t *uprobe = *(fasttrap_probe_spec_t **)arg;
 		fasttrap_probe_spec_t *probe;
 		uint64_t noffs;
 		size_t size;
-		int ret;
-		char *c;
+		int ret, err;
 
-#if defined(sun)
 		if (copyin(&uprobe->ftps_noffs, &noffs,
 		    sizeof (uprobe->ftps_noffs)))
 			return (EFAULT);
-#else
-		noffs = uprobe->ftps_noffs;
-#endif
 
 		/*
 		 * Probes must have at least one tracepoint.
@@ -2091,36 +2304,26 @@
 
 		probe = kmem_alloc(size, KM_SLEEP);
 
-#if defined(sun)
-		if (copyin(uprobe, probe, size) != 0) {
+		if (copyin(uprobe, probe, size) != 0 ||
+		    probe->ftps_noffs != noffs) {
 			kmem_free(probe, size);
 			return (EFAULT);
 		}
-#else
-		memcpy(probe, uprobe, sizeof(*probe));
-		if (noffs > 1 && copyin(uprobe + 1, probe + 1, size) != 0) {
-			kmem_free(probe, size);
-			return (EFAULT);
-		}
-#endif
 
-
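The ftps_noffs re-check above is a double-fetch guard: the field is read from userland twice, once to size the allocation and again as part of the full structure, and a racing userland thread could change it between the two copyin() calls. Failing on a mismatch ensures the kernel never parses more offsets than it allocated room for.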
 		/*
 		 * Verify that the function and module strings contain no
 		 * funny characters.
 		 */
-		for (c = &probe->ftps_func[0]; *c != '\0'; c++) {
-			if (*c < 0x20 || 0x7f <= *c) {
-				ret = EINVAL;
-				goto err;
-			}
+		if (u8_validate(probe->ftps_func, strlen(probe->ftps_func),
+		    NULL, U8_VALIDATE_ENTIRE, &err) < 0) {
+			ret = EINVAL;
+			goto err;
 		}
 
-		for (c = &probe->ftps_mod[0]; *c != '\0'; c++) {
-			if (*c < 0x20 || 0x7f <= *c) {
-				ret = EINVAL;
-				goto err;
-			}
+		if (u8_validate(probe->ftps_mod, strlen(probe->ftps_mod),
+		    NULL, U8_VALIDATE_ENTIRE, &err) < 0) {
+			ret = EINVAL;
+			goto err;
 		}
 
 #ifdef notyet
@@ -2128,7 +2331,7 @@
 			proc_t *p;
 			pid_t pid = probe->ftps_pid;
 
-#if defined(sun)
+#ifdef illumos
 			mutex_enter(&pidlock);
 #endif
 			/*
@@ -2139,12 +2342,12 @@
 			if (p)
 				fill_kinfo_proc(p, &kp);
 			if (p == NULL || kp.ki_stat == SIDL) {
-#if defined(sun)
+#ifdef illumos
 				mutex_exit(&pidlock);
 #endif
 				return (ESRCH);
 			}
-#if defined(sun)
+#ifdef illumos
 			mutex_enter(&p->p_lock);
 			mutex_exit(&pidlock);
 #else
@@ -2154,7 +2357,7 @@
 #ifdef notyet
 			if ((ret = priv_proc_cred_perm(cr, p, NULL,
 			    VREAD | VWRITE)) != 0) {
-#if defined(sun)
+#ifdef illumos
 				mutex_exit(&p->p_lock);
 #else
 				PROC_UNLOCK(p);
@@ -2162,7 +2365,7 @@
 				return (ret);
 			}
 #endif /* notyet */
-#if defined(sun)
+#ifdef illumos
 			mutex_exit(&p->p_lock);
 #else
 			PROC_UNLOCK(p);
@@ -2180,11 +2383,11 @@
 		fasttrap_instr_query_t instr;
 		fasttrap_tracepoint_t *tp;
 		uint_t index;
-#if defined(sun)
+#ifdef illumos
 		int ret;
 #endif
 
-#if defined(sun)
+#ifdef illumos
 		if (copyin((void *)arg, &instr, sizeof (instr)) != 0)
 			return (EFAULT);
 #endif
@@ -2194,7 +2397,7 @@
 			proc_t *p;
 			pid_t pid = instr.ftiq_pid;
 
-#if defined(sun)
+#ifdef illumos
 			mutex_enter(&pidlock);
 #endif
 			/*
@@ -2205,12 +2408,12 @@
 			if (p)
 				fill_kinfo_proc(p, &kp);
 			if (p == NULL || kp.ki_stat == SIDL) {
-#if defined(sun)
+#ifdef illumos
 				mutex_exit(&pidlock);
 #endif
 				return (ESRCH);
 			}
-#if defined(sun)
+#ifdef illumos
 			mutex_enter(&p->p_lock);
 			mutex_exit(&pidlock);
 #else
@@ -2220,7 +2423,7 @@
 #ifdef notyet
 			if ((ret = priv_proc_cred_perm(cr, p, NULL,
 			    VREAD)) != 0) {
-#if defined(sun)
+#ifdef illumos
 				mutex_exit(&p->p_lock);
 #else
 				PROC_UNLOCK(p);
@@ -2229,7 +2432,7 @@
 			}
 #endif /* notyet */
 
-#if defined(sun)
+#ifdef illumos
 			mutex_exit(&p->p_lock);
 #else
 			PROC_UNLOCK(p);
@@ -2272,7 +2475,7 @@
 fasttrap_load(void)
 {
 	ulong_t nent;
-	int i;
+	int i, ret;
 
         /* Create the /dev/dtrace/fasttrap entry. */
         fasttrap_cdev = make_dev(&fasttrap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
@@ -2279,15 +2482,12 @@
             "dtrace/fasttrap");
 
 	mtx_init(&fasttrap_cleanup_mtx, "fasttrap clean", "dtrace", MTX_DEF);
-	callout_init_mtx(&fasttrap_timeout, &fasttrap_cleanup_mtx, 0);
 	mutex_init(&fasttrap_count_mtx, "fasttrap count mtx", MUTEX_DEFAULT,
 	    NULL);
 
-#if defined(sun)
+#ifdef illumos
 	fasttrap_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
 	    "fasttrap-max-probes", FASTTRAP_MAX_DEFAULT);
-#else
-	fasttrap_max = FASTTRAP_MAX_DEFAULT;
 #endif
 	fasttrap_total = 0;
 
@@ -2294,17 +2494,19 @@
 	/*
 	 * Conjure up the tracepoints hashtable...
 	 */
-#if defined(sun)
+#ifdef illumos
 	nent = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
 	    "fasttrap-hash-size", FASTTRAP_TPOINTS_DEFAULT_SIZE);
 #else
-	nent = FASTTRAP_TPOINTS_DEFAULT_SIZE;
+	nent = tpoints_hash_size;
 #endif
 
 	if (nent == 0 || nent > 0x1000000)
 		nent = FASTTRAP_TPOINTS_DEFAULT_SIZE;
 
-	if ((nent & (nent - 1)) == 0)
+	tpoints_hash_size = nent;
+
+	if (ISP2(nent))
 		fasttrap_tpoints.fth_nent = nent;
 	else
 		fasttrap_tpoints.fth_nent = 1 << fasttrap_highbit(nent);
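Keeping fth_nent a power of two lets fth_mask = fth_nent - 1 act as a bit mask, so the FASTTRAP_*_INDEX() macros can select a bucket with a cheap AND instead of a modulo. Assuming the usual 1-based highbit() semantics, a non-power-of-two request is rounded up; e.g. tpoints_hash_size=5000 yields 1 << 13 = 8192 buckets and a mask of 8191.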
@@ -2312,7 +2514,7 @@
 	fasttrap_tpoints.fth_mask = fasttrap_tpoints.fth_nent - 1;
 	fasttrap_tpoints.fth_table = kmem_zalloc(fasttrap_tpoints.fth_nent *
 	    sizeof (fasttrap_bucket_t), KM_SLEEP);
-#if !defined(sun)
+#ifndef illumos
 	for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
 		mutex_init(&fasttrap_tpoints.fth_table[i].ftb_mtx,
 		    "tracepoints bucket mtx", MUTEX_DEFAULT, NULL);
@@ -2322,7 +2524,7 @@
 	 * ... and the providers hash table...
 	 */
 	nent = FASTTRAP_PROVIDERS_DEFAULT_SIZE;
-	if ((nent & (nent - 1)) == 0)
+	if (ISP2(nent))
 		fasttrap_provs.fth_nent = nent;
 	else
 		fasttrap_provs.fth_nent = 1 << fasttrap_highbit(nent);
@@ -2330,17 +2532,35 @@
 	fasttrap_provs.fth_mask = fasttrap_provs.fth_nent - 1;
 	fasttrap_provs.fth_table = kmem_zalloc(fasttrap_provs.fth_nent *
 	    sizeof (fasttrap_bucket_t), KM_SLEEP);
-#if !defined(sun)
+#ifndef illumos
 	for (i = 0; i < fasttrap_provs.fth_nent; i++)
 		mutex_init(&fasttrap_provs.fth_table[i].ftb_mtx, 
 		    "providers bucket mtx", MUTEX_DEFAULT, NULL);
 #endif
 
+	ret = kproc_create(fasttrap_pid_cleanup_cb, NULL,
+	    &fasttrap_cleanup_proc, 0, 0, "ftcleanup");
+	if (ret != 0) {
+		destroy_dev(fasttrap_cdev);
+#ifndef illumos
+		for (i = 0; i < fasttrap_provs.fth_nent; i++)
+			mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx);
+		for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
+			mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx);
+#endif
+		kmem_free(fasttrap_provs.fth_table, fasttrap_provs.fth_nent *
+		    sizeof (fasttrap_bucket_t));
+		mtx_destroy(&fasttrap_cleanup_mtx);
+		mutex_destroy(&fasttrap_count_mtx);
+		return (ret);
+	}
+
 	/*
 	 * ... and the procs hash table.
 	 */
 	nent = FASTTRAP_PROCS_DEFAULT_SIZE;
-	if ((nent & (nent - 1)) == 0)
+	if (ISP2(nent))
 		fasttrap_procs.fth_nent = nent;
 	else
 		fasttrap_procs.fth_nent = 1 << fasttrap_highbit(nent);
@@ -2348,15 +2568,19 @@
 	fasttrap_procs.fth_mask = fasttrap_procs.fth_nent - 1;
 	fasttrap_procs.fth_table = kmem_zalloc(fasttrap_procs.fth_nent *
 	    sizeof (fasttrap_bucket_t), KM_SLEEP);
-#if !defined(sun)
+#ifndef illumos
 	for (i = 0; i < fasttrap_procs.fth_nent; i++)
 		mutex_init(&fasttrap_procs.fth_table[i].ftb_mtx,
 		    "processes bucket mtx", MUTEX_DEFAULT, NULL);
 
-	CPU_FOREACH(i) {
-		mutex_init(&fasttrap_cpuc_pid_lock[i], "fasttrap barrier",
-		    MUTEX_DEFAULT, NULL);
-	}
+	rm_init(&fasttrap_tp_lock, "fasttrap tracepoint");
+
+	/*
+	 * This event handler must run before kdtrace_thread_dtor() since it
+	 * accesses the thread's struct kdtrace_thread.
+	 */
+	fasttrap_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor,
+	    fasttrap_thread_dtor, NULL, EVENTHANDLER_PRI_FIRST);
 #endif
 
 	/*
@@ -2389,15 +2613,6 @@
 		return (-1);
 
 	/*
-	 * Prevent any new timeouts from running by setting fasttrap_timeout
-	 * to a non-zero value, and wait for the current timeout to complete.
-	 */
-	mtx_lock(&fasttrap_cleanup_mtx);
-	fasttrap_cleanup_work = 0;
-	callout_drain(&fasttrap_timeout);
-	mtx_unlock(&fasttrap_cleanup_mtx);
-
-	/*
 	 * Iterate over all of our providers. If there's still a process
 	 * that corresponds to that pid, fail to detach.
 	 */
@@ -2431,20 +2646,6 @@
 	}
 
 	if (fail) {
-		uint_t work;
-		/*
-		 * If we're failing to detach, we need to unblock timeouts
-		 * and start a new timeout if any work has accumulated while
-		 * we've been unsuccessfully trying to detach.
-		 */
-		mtx_lock(&fasttrap_cleanup_mtx);
-		work = fasttrap_cleanup_work;
-		callout_drain(&fasttrap_timeout);
-		mtx_unlock(&fasttrap_cleanup_mtx);
-
-		if (work)
-			fasttrap_pid_cleanup();
-
 		(void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL,
 		    &fasttrap_meta_id);
 
@@ -2451,6 +2652,29 @@
 		return (-1);
 	}
 
+	/*
+	 * Stop new processes from entering these hooks now, before the
+	 * fasttrap_cleanup thread runs.  That way all processes will hopefully
+ * be out of these hooks before we free fasttrap_provs.fth_table.
+	 */
+	ASSERT(dtrace_fasttrap_fork == &fasttrap_fork);
+	dtrace_fasttrap_fork = NULL;
+
+	ASSERT(dtrace_fasttrap_exec == &fasttrap_exec_exit);
+	dtrace_fasttrap_exec = NULL;
+
+	ASSERT(dtrace_fasttrap_exit == &fasttrap_exec_exit);
+	dtrace_fasttrap_exit = NULL;
+
+	mtx_lock(&fasttrap_cleanup_mtx);
+	fasttrap_cleanup_drain = 1;
+	/* Wait for the cleanup thread to finish up and signal us. */
+	wakeup(&fasttrap_cleanup_cv);
+	mtx_sleep(&fasttrap_cleanup_drain, &fasttrap_cleanup_mtx, 0, "ftcld",
+	    0);
+	fasttrap_cleanup_proc = NULL;
+	mtx_destroy(&fasttrap_cleanup_mtx);
+
 #ifdef DEBUG
 	mutex_enter(&fasttrap_count_mtx);
 	ASSERT(fasttrap_pid_count == 0);
@@ -2457,6 +2681,16 @@
 	mutex_exit(&fasttrap_count_mtx);
 #endif
 
+#ifndef illumos
+	EVENTHANDLER_DEREGISTER(thread_dtor, fasttrap_thread_dtor_tag);
+
+	for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
+		mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx);
+	for (i = 0; i < fasttrap_provs.fth_nent; i++)
+		mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx);
+	for (i = 0; i < fasttrap_procs.fth_nent; i++)
+		mutex_destroy(&fasttrap_procs.fth_table[i].ftb_mtx);
+#endif
 	kmem_free(fasttrap_tpoints.fth_table,
 	    fasttrap_tpoints.fth_nent * sizeof (fasttrap_bucket_t));
 	fasttrap_tpoints.fth_nent = 0;
@@ -2469,28 +2703,10 @@
 	    fasttrap_procs.fth_nent * sizeof (fasttrap_bucket_t));
 	fasttrap_procs.fth_nent = 0;
 
-	/*
-	 * We know there are no tracepoints in any process anywhere in
-	 * the system so there is no process which has its p_dtrace_count
-	 * greater than zero, therefore we know that no thread can actively
-	 * be executing code in fasttrap_fork(). Similarly for p_dtrace_probes
-	 * and fasttrap_exec() and fasttrap_exit().
-	 */
-	ASSERT(dtrace_fasttrap_fork == &fasttrap_fork);
-	dtrace_fasttrap_fork = NULL;
-
-	ASSERT(dtrace_fasttrap_exec == &fasttrap_exec_exit);
-	dtrace_fasttrap_exec = NULL;
-
-	ASSERT(dtrace_fasttrap_exit == &fasttrap_exec_exit);
-	dtrace_fasttrap_exit = NULL;
-
-#if !defined(sun)
+#ifndef illumos
 	destroy_dev(fasttrap_cdev);
 	mutex_destroy(&fasttrap_count_mtx);
-	CPU_FOREACH(i) {
-		mutex_destroy(&fasttrap_cpuc_pid_lock[i]);
-	}
+	rm_destroy(&fasttrap_tp_lock);
 #endif
 
 	return (0);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/lockstat.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/lockstat.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/lockstat.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/profile.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/profile.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/profile.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -169,9 +170,9 @@
 	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
 		return;
 
-	atomic_add_32(&profile_total, 1);
+	atomic_inc_32(&profile_total);
 	if (profile_total > profile_max) {
-		atomic_add_32(&profile_total, -1);
+		atomic_dec_32(&profile_total);
 		return;
 	}
 
@@ -326,7 +327,7 @@
 	kmem_free(prof, sizeof (profile_probe_t));
 
 	ASSERT(profile_total >= 1);
-	atomic_add_32(&profile_total, -1);
+	atomic_dec_32(&profile_total);
 }
 
 /*ARGSUSED*/

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/sdt_subr.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/sdt_subr.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/sdt_subr.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -21,6 +22,7 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -85,11 +87,11 @@
 
 sdt_provider_t sdt_providers[] = {
 	{ "vtrace", "__vtrace_", &vtrace_attr, 0 },
-	{ "sysinfo", "__cpu_sysinfo_", &info_attr, 0 },
-	{ "vminfo", "__cpu_vminfo_", &info_attr, 0 },
+	{ "sysinfo", "__cpu_sysinfo_", &info_attr, DTRACE_PRIV_USER },
+	{ "vminfo", "__cpu_vminfo_", &info_attr, DTRACE_PRIV_USER },
 	{ "fpuinfo", "__fpuinfo_", &fpu_attr, 0 },
-	{ "sched", "__sched_", &stab_attr, 0 },
-	{ "proc", "__proc_", &stab_attr, 0 },
+	{ "sched", "__sched_", &stab_attr, DTRACE_PRIV_USER },
+	{ "proc", "__proc_", &stab_attr, DTRACE_PRIV_USER },
 	{ "io", "__io_", &stab_attr, 0 },
 	{ "mib", "__mib_", &stab_attr, 0 },
 	{ "fsinfo", "__fsinfo_", &fsinfo_attr, 0 },
@@ -852,6 +854,20 @@
 };
 
 /*ARGSUSED*/
+int
+sdt_mode(void *arg, dtrace_id_t id, void *parg)
+{
+	/*
+	 * We tell DTrace that we're in kernel mode, that the firing needs to
+	 * be dropped for anything that doesn't have necessary privileges, and
+	 * that it needs to be restricted for anything that has restricted
+	 * (i.e., not all-zone) privileges.
+	 */
+	return (DTRACE_MODE_KERNEL | DTRACE_MODE_NOPRIV_DROP |
+	    DTRACE_MODE_LIMITEDPRIV_RESTRICT);
+}
+
+/*ARGSUSED*/
 void
 sdt_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc)
 {

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/systrace.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/systrace.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/systrace.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -165,11 +166,11 @@
 		return;
 	}
 
-	(void) casptr(&sysent[sysnum].sy_callc,
+	(void) atomic_cas_ptr(&sysent[sysnum].sy_callc,
 	    (void *)systrace_sysent[sysnum].stsy_underlying,
 	    (void *)dtrace_systrace_syscall);
 #ifdef _SYSCALL32_IMPL
-	(void) casptr(&sysent32[sysnum].sy_callc,
+	(void) atomic_cas_ptr(&sysent32[sysnum].sy_callc,
 	    (void *)systrace_sysent32[sysnum].stsy_underlying,
 	    (void *)dtrace_systrace_syscall32);
 #endif
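Note that atomic_cas_ptr() is doing more than an atomic store here: on enable, the DTrace interposer is installed only if the sysent slot still holds the saved underlying handler, and on disable the original handler is restored only if the slot still points at the interposer. If something else has patched the slot in the meantime, the compare-and-swap deliberately leaves it alone.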
@@ -184,12 +185,12 @@
 	    systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
 
 	if (disable) {
-		(void) casptr(&sysent[sysnum].sy_callc,
+		(void) atomic_cas_ptr(&sysent[sysnum].sy_callc,
 		    (void *)dtrace_systrace_syscall,
 		    (void *)systrace_sysent[sysnum].stsy_underlying);
 
 #ifdef _SYSCALL32_IMPL
-		(void) casptr(&sysent32[sysnum].sy_callc,
+		(void) atomic_cas_ptr(&sysent32[sysnum].sy_callc,
 		    (void *)dtrace_systrace_syscall32,
 		    (void *)systrace_sysent32[sysnum].stsy_underlying);
 #endif

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,9 +21,10 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  */
 
 /*
@@ -82,9 +83,9 @@
  * types of locks: 1) the hash table lock array, and 2) the
  * arc list locks.
  *
- * Buffers do not have their own mutexs, rather they rely on the
- * hash table mutexs for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexs).
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
  *
  * buf_hash_find() returns the appropriate mutex (held) when it
  * locates the requested buffer in the hash table.  It returns
@@ -105,13 +106,13 @@
  * with the buffer may be evicted prior to the callback.  The callback
  * must be made with *no locks held* (to prevent deadlock).  Additionally,
  * the users of callbacks must ensure that their private data is
- * protected from simultaneous callbacks from arc_buf_evict()
+ * protected from simultaneous callbacks from arc_clear_callback()
  * and arc_do_user_evicts().
  *
  * Note that the majority of the performance stats are manipulated
  * with atomic operations.
  *
- * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
+ * The L2ARC uses the l2ad_mtx on each vdev for the following:
  *
  *	- L2ARC buflist creation
  *	- L2ARC buflist eviction
@@ -120,14 +121,141 @@
  *	- ARC header release, as it removes from L2ARC buflists
  */
 
+/*
+ * ARC operation:
+ *
+ * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
+ * This structure can point either to a block that is still in the cache or to
+ * one that is only accessible in an L2 ARC device, or it can provide
+ * information about a block that was recently evicted. If a block is
+ * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
+ * information to retrieve it from the L2ARC device. This information is
+ * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
+ * that is in this state cannot access the data directly.
+ *
+ * Blocks that are actively being referenced or have not been evicted
+ * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
+ * the arc_buf_hdr_t that will point to the data block in memory. A block can
+ * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
+ * caches data in two ways -- in a list of arc buffers (arc_buf_t) and
+ * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata).
+ * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC
+ * consumer, and always contains uncompressed data. The ARC will provide
+ * references to this data and will keep it cached until it is no longer in
+ * use. Typically, the arc will try to cache only the L1ARC's physical data
+ * block and will aggressively evict any arc_buf_t that is no longer referenced.
+ * The amount of memory consumed by the arc_buf_t's can be seen via the
+ * "overhead_size" kstat.
+ *
+ *
+ *                arc_buf_hdr_t
+ *                +-----------+
+ *                |           |
+ *                |           |
+ *                |           |
+ *                +-----------+
+ * l2arc_buf_hdr_t|           |
+ *                |           |
+ *                +-----------+
+ * l1arc_buf_hdr_t|           |
+ *                |           |                 arc_buf_t
+ *                |    b_buf  +------------>+---------+      arc_buf_t
+ *                |           |             |b_next   +---->+---------+
+ *                |  b_pdata  +-+           |---------|     |b_next   +-->NULL
+ *                +-----------+ |           |         |     +---------+
+ *                              |           |b_data   +-+   |         |
+ *                              |           +---------+ |   |b_data   +-+
+ *                              +->+------+             |   +---------+ |
+ *                   (potentially) |      |             |               |
+ *                     compressed  |      |             |               |
+ *                        data     +------+             |               v
+ *                                                      +->+------+     +------+
+ *                                            uncompressed |      |     |      |
+ *                                                data     |      |     |      |
+ *                                                         +------+     +------+
+ *
+ * The L1ARC's data pointer, however, may or may not be uncompressed. The
+ * ARC has the ability to store the physical data (b_pdata) associated with
+ * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk
+ * physical block, it will match its on-disk compression characteristics.
+ * If the block on-disk is compressed, then the physical data block
+ * in the cache will also be compressed and vice-versa. This behavior
+ * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
+ * compressed ARC functionality is disabled, the b_pdata will point to an
+ * uncompressed version of the on-disk data.
+ *
+ * When a consumer reads a block, the ARC must first look to see if the
+ * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t,
+ * then an additional arc_buf_t is allocated and the uncompressed data is
+ * bcopied from the existing arc_buf_t. If the hdr is cached but does not
+ * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses
+ * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's
+ * b_pdata is not compressed, then the block is shared with the newly
+ * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t
+ * in the arc buffer chain. Sharing the block reduces the memory overhead
+ * required when the hdr is caching uncompressed blocks or the compressed
+ * arc functionality has been disabled via 'zfs_compressed_arc_enabled'.
+ *
+ * The diagram below shows an example of an uncompressed ARC hdr that is
+ * sharing its data with an arc_buf_t:
+ *
+ *                arc_buf_hdr_t
+ *                +-----------+
+ *                |           |
+ *                |           |
+ *                |           |
+ *                +-----------+
+ * l2arc_buf_hdr_t|           |
+ *                |           |
+ *                +-----------+
+ * l1arc_buf_hdr_t|           |
+ *                |           |                 arc_buf_t    (shared)
+ *                |    b_buf  +------------>+---------+      arc_buf_t
+ *                |           |             |b_next   +---->+---------+
+ *                |  b_pdata  +-+           |---------|     |b_next   +-->NULL
+ *                +-----------+ |           |         |     +---------+
+ *                              |           |b_data   +-+   |         |
+ *                              |           +---------+ |   |b_data   +-+
+ *                              +->+------+             |   +---------+ |
+ *                                 |      |             |               |
+ *                   uncompressed  |      |             |               |
+ *                        data     +------+             |               |
+ *                                    ^                 +->+------+     |
+ *                                    |       uncompressed |      |     |
+ *                                    |           data     |      |     |
+ *                                    |                    +------+     |
+ *                                    +---------------------------------+
+ *
+ * Writing to the arc requires that the ARC first discard the b_pdata
+ * since the physical block is about to be rewritten. The new data contents
+ * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline
+ * performs the write, it may compress the data before writing it to disk.
+ * The ARC will be called with the transformed data and will bcopy the
+ * transformed on-disk block into a newly allocated b_pdata.
+ *
+ * When the L2ARC is in use, it will also take advantage of the b_pdata. The
+ * L2ARC will always write the contents of b_pdata to the L2ARC. This means
+ * that when compressed arc is enabled, the L2ARC blocks are identical
+ * to the on-disk block in the main data pool. This provides a significant
+ * advantage since the ARC can leverage the bp's checksum when reading from the
+ * L2ARC to determine if the contents are valid. However, if the compressed
+ * arc is disabled, then the L2ARC's block must be transformed to look
+ * like the physical block in the main data pool before comparing the
+ * checksum and determining its validity.
+ */
+
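The relationships described in the comment above can be condensed into a stripped-down sketch; the field subset is illustrative only, not the actual ZFS definitions:

	typedef struct arc_buf {
		struct arc_buf	*b_next;   /* next consumer of this block */
		void		*b_data;   /* always-uncompressed copy */
	} arc_buf_t;

	typedef struct arc_buf_hdr {
		/* l2arc_buf_hdr_t part: location on an L2ARC device */
		/* l1arc_buf_hdr_t part: */
		arc_buf_t	*b_buf;    /* chain of active arc_buf_t's */
		void		*b_pdata;  /* copy of the (possibly
					      compressed) on-disk block */
	} arc_buf_hdr_t;
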
 #include <sys/spa.h>
 #include <sys/zio.h>
+#include <sys/spa_impl.h>
 #include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/refcount.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/multilist.h>
 #ifdef _KERNEL
 #include <sys/dnlc.h>
 #endif
@@ -137,7 +265,7 @@
 #include <zfs_fletcher.h>
 #include <sys/sdt.h>
 
-#include <vm/vm_pageout.h>
+#include <machine/vmparam.h>
 
 #ifdef illumos
 #ifndef _KERNEL
@@ -147,39 +275,66 @@
 #endif
 #endif /* illumos */
 
-static kmutex_t		arc_reclaim_thr_lock;
-static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
-static uint8_t		arc_thread_exit;
+static kmutex_t		arc_reclaim_lock;
+static kcondvar_t	arc_reclaim_thread_cv;
+static boolean_t	arc_reclaim_thread_exit;
+static kcondvar_t	arc_reclaim_waiters_cv;
 
-extern int zfs_write_limit_shift;
-extern uint64_t zfs_write_limit_max;
-extern kmutex_t zfs_write_limit_lock;
+uint_t arc_reduce_dnlc_percent = 3;
 
-#define	ARC_REDUCE_DNLC_PERCENT	3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+/*
+ * The number of headers to evict in arc_evict_state_impl() before
+ * dropping the sublist lock and evicting from another sublist. A lower
+ * value means we're more likely to evict the "correct" header (i.e. the
+ * oldest header in the arc state), but comes with higher overhead
+ * (i.e. more invocations of arc_evict_state_impl()).
+ */
+int zfs_arc_evict_batch_limit = 10;
 
-typedef enum arc_reclaim_strategy {
-	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
-	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
-} arc_reclaim_strategy_t;
+/*
+ * The number of sublists used for each of the arc state lists. If this
+ * is not set to a suitable value by the user, it will be configured to
+ * the number of CPUs on the system in arc_init().
+ */
+int zfs_arc_num_sublists_per_state = 0;
 
 /* number of seconds before growing cache again */
 static int		arc_grow_retry = 60;
 
+/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
+int		zfs_arc_overflow_shift = 8;
+
 /* shift of arc_c for calculating both min and max arc_p */
 static int		arc_p_min_shift = 4;
 
 /* log2(fraction of arc to reclaim) */
-static int		arc_shrink_shift = 5;
+static int		arc_shrink_shift = 7;
 
 /*
+ * log2(fraction of ARC which must be free to allow growing).
+ * I.e., if there is less than arc_c >> arc_no_grow_shift free memory,
+ * when reading a new block into the ARC, we will evict an equal-sized block
+ * from the ARC.
+ *
+ * This must be less than arc_shrink_shift, so that when we shrink the ARC,
+ * we will still not allow it to grow.
+ */
+int			arc_no_grow_shift = 5;
+
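Concretely, with the defaults above, growth stops once free memory falls below
1/32 of arc_c (arc_no_grow_shift = 5), while each shrink reclaims 1/128 of
arc_c (arc_shrink_shift = 7); zfs_arc_overflow_shift similarly bounds how far
arc_size may exceed arc_c in arc_get_data_buf(). A hedged sketch of the
no-grow decision (arc_available_memory() is assumed from elsewhere in this
file):

    int64_t free_memory = arc_available_memory();

    /* Stop growing while free memory is below arc_c >> arc_no_grow_shift. */
    arc_no_grow = (free_memory < (int64_t)(arc_c >> arc_no_grow_shift));
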
+/*
  * minimum lifespan of a prefetch block in clock ticks
  * (initialized in arc_init())
  */
 static int		arc_min_prefetch_lifespan;
 
+/*
+ * If this percent of memory is free, don't throttle.
+ */
+int arc_lotsfree_percent = 10;
+
 static int arc_dead;
-extern int zfs_prefetch_disable;
+extern boolean_t zfs_prefetch_disable;
 
 /*
  * The arc has filled available memory and has now warmed up.
@@ -192,21 +347,94 @@
 uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
+uint64_t zfs_arc_meta_min = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
-int zfs_disable_dup_eviction = 0;
+uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
+u_int zfs_arc_free_target = 0;
 
+/* Absolute min for arc min / max is 16MB. */
+static uint64_t arc_abs_min = 16 << 20;
+
+boolean_t zfs_compressed_arc_enabled = B_TRUE;
+
+static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+static void
+arc_free_target_init(void *unused __unused)
+{
+
+	zfs_arc_free_target = vm_pageout_wakeup_thresh;
+}
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
+    arc_free_target_init, NULL);
+
 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
+TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
+TUNABLE_QUAD("vfs.zfs.arc_average_blocksize", &zfs_arc_average_blocksize);
+TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
 SYSCTL_DECL(_vfs_zfs);
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
-    "Maximum ARC size");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
-    "Minimum ARC size");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN,
+    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN,
+    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
+    &zfs_arc_average_blocksize, 0,
+    "ARC average blocksize");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
+    &arc_shrink_shift, 0,
+    "log2(fraction of arc to reclaim)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
+    &zfs_compressed_arc_enabled, 0, "Enable compressed ARC");
 
 /*
+ * We don't have a tunable for arc_free_target due to the dependency on
+ * pagedaemon initialisation.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
+    sysctl_vfs_zfs_arc_free_target, "IU",
+    "Desired number of free pages below which ARC triggers reclaim");
+
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+	u_int val;
+	int err;
+
+	val = zfs_arc_free_target;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < minfree)
+		return (EINVAL);
+	if (val > cnt.v_page_count)
+		return (EINVAL);
+
+	zfs_arc_free_target = val;
+
+	return (0);
+}
+
+/*
+ * This must be declared here, before the definition of the corresponding
+ * kstat macro, because that macro uses the same names and would otherwise
+ * confuse the compiler.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
+    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+    sysctl_vfs_zfs_arc_meta_limit, "QU",
+    "ARC metadata limit");
+#endif
+
+/*
  * Note that buffers can be in one of 6 states:
  *	ARC_anon	- anonymous (discussed below)
  *	ARC_mru		- recently used, currently cached
@@ -238,31 +466,22 @@
  * second level ARC benefit from these fast lookups.
  */
 
-#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
-struct arcs_lock {
-	kmutex_t	arcs_lock;
-#ifdef _KERNEL
-	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
-#endif
-};
-
-/*
- * must be power of two for mask use to work
- *
- */
-#define ARC_BUFC_NUMDATALISTS		16
-#define ARC_BUFC_NUMMETADATALISTS	16
-#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
-
 typedef struct arc_state {
-	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
-	uint64_t arcs_size;	/* total amount of data in this state */
-	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
-	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
+	/*
+	 * list of evictable buffers
+	 */
+	multilist_t arcs_list[ARC_BUFC_NUMTYPES];
+	/*
+	 * total amount of evictable data in this state
+	 */
+	refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
+	/*
+	 * total amount of data in this state; this includes: evictable,
+	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
+	 */
+	refcount_t arcs_size;
 } arc_state_t;
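Since arcs_size and arcs_esize are now refcount_t's rather than plain
counters, per-state sizes are read through the refcount interface (the
sysctls further down export the rc_count field directly). For example:

    /* Total and evictable-metadata bytes currently in the MRU state. */
    uint64_t mru_size = refcount_count(&ARC_mru.arcs_size);
    uint64_t mru_meta_esize =
        refcount_count(&ARC_mru.arcs_esize[ARC_BUFC_METADATA]);
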
 
-#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
-
 /* The 6 states: */
 static arc_state_t ARC_anon;
 static arc_state_t ARC_mru;
@@ -288,8 +507,6 @@
 	kstat_named_t arcstat_mfu_ghost_hits;
 	kstat_named_t arcstat_allocated;
 	kstat_named_t arcstat_deleted;
-	kstat_named_t arcstat_stolen;
-	kstat_named_t arcstat_recycle_miss;
 	/*
 	 * Number of buffers that could not be evicted because the hash lock
 	 * was held by another thread.  The lock may not necessarily be held
@@ -303,9 +520,15 @@
 	 * not from the spa we're trying to evict from.
 	 */
 	kstat_named_t arcstat_evict_skip;
+	/*
+	 * Number of times arc_evict_state() was unable to evict enough
+	 * buffers to reach its target amount.
+	 */
+	kstat_named_t arcstat_evict_not_enough;
 	kstat_named_t arcstat_evict_l2_cached;
 	kstat_named_t arcstat_evict_l2_eligible;
 	kstat_named_t arcstat_evict_l2_ineligible;
+	kstat_named_t arcstat_evict_l2_skip;
 	kstat_named_t arcstat_hash_elements;
 	kstat_named_t arcstat_hash_elements_max;
 	kstat_named_t arcstat_hash_collisions;
@@ -316,9 +539,157 @@
 	kstat_named_t arcstat_c_min;
 	kstat_named_t arcstat_c_max;
 	kstat_named_t arcstat_size;
+	/*
+	 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata.
+	 * Note that the compressed bytes may match the uncompressed bytes
+	 * if the block is either not compressed or compressed arc is disabled.
+	 */
+	kstat_named_t arcstat_compressed_size;
+	/*
+	 * Uncompressed size of the data stored in b_pdata. If compressed
+	 * arc is disabled then this value will be identical to the stat
+	 * above.
+	 */
+	kstat_named_t arcstat_uncompressed_size;
+	/*
+	 * Number of bytes stored in all the arc_buf_t's. This is classified
+	 * as "overhead" since this data is typically short-lived and will
+	 * be evicted from the arc when it becomes unreferenced unless the
+	 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
+	 * values have been set (see comment in dbuf.c for more information).
+	 */
+	kstat_named_t arcstat_overhead_size;
+	/*
+	 * Number of bytes consumed by internal ARC structures necessary
+	 * for tracking purposes; these structures are not actually
+	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
+	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
+	 * caches), and arc_buf_t structures (allocated via arc_buf_t
+	 * cache).
+	 */
 	kstat_named_t arcstat_hdr_size;
+	/*
+	 * Number of bytes consumed by ARC buffers of type equal to
+	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
+	 * on disk user data (e.g. plain file contents).
+	 */
 	kstat_named_t arcstat_data_size;
+	/*
+	 * Number of bytes consumed by ARC buffers of type equal to
+	 * ARC_BUFC_METADATA. This is generally consumed by buffers
+	 * backing on disk data that is used for internal ZFS
+	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
+	 */
+	kstat_named_t arcstat_metadata_size;
+	/*
+	 * Number of bytes consumed by various buffers and structures
+	 * not actually backed with ARC buffers. This includes bonus
+	 * buffers (allocated directly via zio_buf_* functions),
+	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
+	 * cache), and dnode_t structures (allocated via dnode_t cache).
+	 */
 	kstat_named_t arcstat_other_size;
+	/*
+	 * Total number of bytes consumed by ARC buffers residing in the
+	 * arc_anon state. This includes *all* buffers in the arc_anon
+	 * state; e.g. data, metadata, evictable, and unevictable buffers
+	 * are all included in this value.
+	 */
+	kstat_named_t arcstat_anon_size;
+	/*
+	 * Number of bytes consumed by ARC buffers that meet the
+	 * following criteria: backing buffers of type ARC_BUFC_DATA,
+	 * residing in the arc_anon state, and are eligible for eviction
+	 * (e.g. have no outstanding holds on the buffer).
+	 */
+	kstat_named_t arcstat_anon_evictable_data;
+	/*
+	 * Number of bytes consumed by ARC buffers that meet the
+	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
+	 * residing in the arc_anon state, and are eligible for eviction
+	 * (e.g. have no outstanding holds on the buffer).
+	 */
+	kstat_named_t arcstat_anon_evictable_metadata;
+	/*
+	 * Total number of bytes consumed by ARC buffers residing in the
+	 * arc_mru state. This includes *all* buffers in the arc_mru
+	 * state; e.g. data, metadata, evictable, and unevictable buffers
+	 * are all included in this value.
+	 */
+	kstat_named_t arcstat_mru_size;
+	/*
+	 * Number of bytes consumed by ARC buffers that meet the
+	 * following criteria: backing buffers of type ARC_BUFC_DATA,
+	 * residing in the arc_mru state, and are eligible for eviction
+	 * (e.g. have no outstanding holds on the buffer).
+	 */
+	kstat_named_t arcstat_mru_evictable_data;
+	/*
+	 * Number of bytes consumed by ARC buffers that meet the
+	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
+	 * residing in the arc_mru state, and are eligible for eviction
+	 * (e.g. have no outstanding holds on the buffer).
+	 */
+	kstat_named_t arcstat_mru_evictable_metadata;
+	/*
+	 * Total number of bytes that *would have been* consumed by ARC
+	 * buffers in the arc_mru_ghost state. The key thing to note
+	 * here, is the fact that this size doesn't actually indicate
+	 * RAM consumption. The ghost lists only consist of headers and
+	 * don't actually have ARC buffers linked off of these headers.
+	 * Thus, *if* the headers had associated ARC buffers, these
+	 * buffers *would have* consumed this number of bytes.
+	 */
+	kstat_named_t arcstat_mru_ghost_size;
+	/*
+	 * Number of bytes that *would have been* consumed by ARC
+	 * buffers that are eligible for eviction, of type
+	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
+	 */
+	kstat_named_t arcstat_mru_ghost_evictable_data;
+	/*
+	 * Number of bytes that *would have been* consumed by ARC
+	 * buffers that are eligible for eviction, of type
+	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
+	 */
+	kstat_named_t arcstat_mru_ghost_evictable_metadata;
+	/*
+	 * Total number of bytes consumed by ARC buffers residing in the
+	 * arc_mfu state. This includes *all* buffers in the arc_mfu
+	 * state; e.g. data, metadata, evictable, and unevictable buffers
+	 * are all included in this value.
+	 */
+	kstat_named_t arcstat_mfu_size;
+	/*
+	 * Number of bytes consumed by ARC buffers that are eligible for
+	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
+	 * state.
+	 */
+	kstat_named_t arcstat_mfu_evictable_data;
+	/*
+	 * Number of bytes consumed by ARC buffers that are eligible for
+	 * eviction, of type ARC_BUFC_METADATA, and reside in the
+	 * arc_mfu state.
+	 */
+	kstat_named_t arcstat_mfu_evictable_metadata;
+	/*
+	 * Total number of bytes that *would have been* consumed by ARC
+	 * buffers in the arc_mfu_ghost state. See the comment above
+	 * arcstat_mru_ghost_size for more details.
+	 */
+	kstat_named_t arcstat_mfu_ghost_size;
+	/*
+	 * Number of bytes that *would have been* consumed by ARC
+	 * buffers that are eligible for eviction, of type
+	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
+	 */
+	kstat_named_t arcstat_mfu_ghost_evictable_data;
+	/*
+	 * Number of bytes that *would have been* consumed by ARC
+	 * buffers that are eligible for eviction, of type
+	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
+	 */
+	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
 	kstat_named_t arcstat_l2_hits;
 	kstat_named_t arcstat_l2_misses;
 	kstat_named_t arcstat_l2_feeds;
@@ -328,9 +699,10 @@
 	kstat_named_t arcstat_l2_writes_sent;
 	kstat_named_t arcstat_l2_writes_done;
 	kstat_named_t arcstat_l2_writes_error;
-	kstat_named_t arcstat_l2_writes_hdr_miss;
+	kstat_named_t arcstat_l2_writes_lock_retry;
 	kstat_named_t arcstat_l2_evict_lock_retry;
 	kstat_named_t arcstat_l2_evict_reading;
+	kstat_named_t arcstat_l2_evict_l1cached;
 	kstat_named_t arcstat_l2_free_on_write;
 	kstat_named_t arcstat_l2_abort_lowmem;
 	kstat_named_t arcstat_l2_cksum_bad;
@@ -338,9 +710,6 @@
 	kstat_named_t arcstat_l2_size;
 	kstat_named_t arcstat_l2_asize;
 	kstat_named_t arcstat_l2_hdr_size;
-	kstat_named_t arcstat_l2_compress_successes;
-	kstat_named_t arcstat_l2_compress_zeros;
-	kstat_named_t arcstat_l2_compress_failures;
 	kstat_named_t arcstat_l2_write_trylock_fail;
 	kstat_named_t arcstat_l2_write_passed_headroom;
 	kstat_named_t arcstat_l2_write_spa_mismatch;
@@ -354,9 +723,12 @@
 	kstat_named_t arcstat_l2_write_buffer_list_iter;
 	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
 	kstat_named_t arcstat_memory_throttle_count;
-	kstat_named_t arcstat_duplicate_buffers;
-	kstat_named_t arcstat_duplicate_buffers_size;
-	kstat_named_t arcstat_duplicate_reads;
+	kstat_named_t arcstat_meta_used;
+	kstat_named_t arcstat_meta_limit;
+	kstat_named_t arcstat_meta_max;
+	kstat_named_t arcstat_meta_min;
+	kstat_named_t arcstat_sync_wait_for_async;
+	kstat_named_t arcstat_demand_hit_predictive_prefetch;
 } arc_stats_t;
 
 static arc_stats_t arc_stats = {
@@ -376,13 +748,13 @@
 	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
 	{ "allocated",			KSTAT_DATA_UINT64 },
 	{ "deleted",			KSTAT_DATA_UINT64 },
-	{ "stolen",			KSTAT_DATA_UINT64 },
-	{ "recycle_miss",		KSTAT_DATA_UINT64 },
 	{ "mutex_miss",			KSTAT_DATA_UINT64 },
 	{ "evict_skip",			KSTAT_DATA_UINT64 },
+	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
+	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
 	{ "hash_elements",		KSTAT_DATA_UINT64 },
 	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
 	{ "hash_collisions",		KSTAT_DATA_UINT64 },
@@ -393,9 +765,28 @@
 	{ "c_min",			KSTAT_DATA_UINT64 },
 	{ "c_max",			KSTAT_DATA_UINT64 },
 	{ "size",			KSTAT_DATA_UINT64 },
+	{ "compressed_size",		KSTAT_DATA_UINT64 },
+	{ "uncompressed_size",		KSTAT_DATA_UINT64 },
+	{ "overhead_size",		KSTAT_DATA_UINT64 },
 	{ "hdr_size",			KSTAT_DATA_UINT64 },
 	{ "data_size",			KSTAT_DATA_UINT64 },
+	{ "metadata_size",		KSTAT_DATA_UINT64 },
 	{ "other_size",			KSTAT_DATA_UINT64 },
+	{ "anon_size",			KSTAT_DATA_UINT64 },
+	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
+	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
+	{ "mru_size",			KSTAT_DATA_UINT64 },
+	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
+	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
+	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
+	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
+	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
+	{ "mfu_size",			KSTAT_DATA_UINT64 },
+	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
+	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
+	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
+	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
+	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 	{ "l2_hits",			KSTAT_DATA_UINT64 },
 	{ "l2_misses",			KSTAT_DATA_UINT64 },
 	{ "l2_feeds",			KSTAT_DATA_UINT64 },
@@ -405,9 +796,10 @@
 	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
-	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
+	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
+	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
 	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
 	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
@@ -415,9 +807,6 @@
 	{ "l2_size",			KSTAT_DATA_UINT64 },
 	{ "l2_asize",			KSTAT_DATA_UINT64 },
 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
-	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
-	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
-	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
 	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
 	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
 	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
@@ -431,9 +820,12 @@
 	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
 	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
-	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
-	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
-	{ "duplicate_reads",		KSTAT_DATA_UINT64 }
+	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
+	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
+	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
+	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
+	{ "sync_wait_for_async",	KSTAT_DATA_UINT64 },
+	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
 };
 
 #define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
@@ -495,23 +887,22 @@
 #define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
 #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
 #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
+#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
+#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
+#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
+#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
 
-#define	L2ARC_IS_VALID_COMPRESS(_c_) \
-	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
+/* compressed size of entire arc */
+#define	arc_compressed_size	ARCSTAT(arcstat_compressed_size)
+/* uncompressed size of entire arc */
+#define	arc_uncompressed_size	ARCSTAT(arcstat_uncompressed_size)
+/* number of bytes in the arc from arc_buf_t's */
+#define	arc_overhead_size	ARCSTAT(arcstat_overhead_size)
 
 static int		arc_no_grow;	/* Don't try to grow cache size */
 static uint64_t		arc_tempreserve;
 static uint64_t		arc_loaned_bytes;
-static uint64_t		arc_meta_used;
-static uint64_t		arc_meta_limit;
-static uint64_t		arc_meta_max = 0;
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RDTUN,
-    &arc_meta_used, 0, "ARC metadata used");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN,
-    &arc_meta_limit, 0, "ARC metadata limit");
 
-typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
-
 typedef struct arc_callback arc_callback_t;
 
 struct arc_callback {
@@ -527,36 +918,64 @@
 struct arc_write_callback {
 	void		*awcb_private;
 	arc_done_func_t	*awcb_ready;
+	arc_done_func_t	*awcb_children_ready;
+	arc_done_func_t	*awcb_physdone;
 	arc_done_func_t	*awcb_done;
 	arc_buf_t	*awcb_buf;
 };
 
-struct arc_buf_hdr {
-	/* protected by hash lock */
-	dva_t			b_dva;
-	uint64_t		b_birth;
-	uint64_t		b_cksum0;
-
+/*
+ * ARC buffers are separated into multiple structs as a memory saving measure:
+ *   - Common fields struct, always defined, and embedded within it:
+ *       - L2-only fields, always allocated but undefined when not in L2ARC
+ *       - L1-only fields, only allocated when in L1ARC
+ *
+ *           Buffer in L1                     Buffer only in L2
+ *    +------------------------+          +------------------------+
+ *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
+ *    |                        |          |                        |
+ *    |                        |          |                        |
+ *    |                        |          |                        |
+ *    +------------------------+          +------------------------+
+ *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
+ *    | (undefined if L1-only) |          |                        |
+ *    +------------------------+          +------------------------+
+ *    | l1arc_buf_hdr_t        |
+ *    |                        |
+ *    |                        |
+ *    |                        |
+ *    |                        |
+ *    +------------------------+
+ *
+ * Because it's possible for the L2ARC to become extremely large, we can wind
+ * up eating a lot of memory in L2ARC buffer headers, so the size of a header
+ * is minimized by only allocating the fields necessary for an L1-cached buffer
+ * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
+ * l2arc_buf_hdr) are embedded rather than allocated separately, which saves a
+ * couple of pointer-sized words per header. arc_hdr_realloc() is used to
+ * switch a header between these two allocation states.
+ */
+typedef struct l1arc_buf_hdr {
 	kmutex_t		b_freeze_lock;
 	zio_cksum_t		*b_freeze_cksum;
+#ifdef ZFS_DEBUG
+	/*
+	 * used for debugging with kmem_flags - by allocating and freeing
+	 * b_thawed when the buffer is thawed, we get a record of the stack
+	 * trace that thawed it.
+	 */
 	void			*b_thawed;
+#endif
 
-	arc_buf_hdr_t		*b_hash_next;
 	arc_buf_t		*b_buf;
-	uint32_t		b_flags;
-	uint32_t		b_datacnt;
-
-	arc_callback_t		*b_acb;
+	uint32_t		b_bufcnt;
+	/* for waiting on writes to complete */
 	kcondvar_t		b_cv;
+	uint8_t			b_byteswap;
 
-	/* immutable */
-	arc_buf_contents_t	b_type;
-	uint64_t		b_size;
-	uint64_t		b_spa;
-
 	/* protected by arc state mutex */
 	arc_state_t		*b_state;
-	list_node_t		b_arc_node;
+	multilist_node_t	b_arc_node;
 
 	/* updated atomically */
 	clock_t			b_arc_access;
@@ -564,66 +983,201 @@
 	/* self protecting */
 	refcount_t		b_refcnt;
 
-	l2arc_buf_hdr_t		*b_l2hdr;
+	arc_callback_t		*b_acb;
+	void			*b_pdata;
+} l1arc_buf_hdr_t;
+
+typedef struct l2arc_dev l2arc_dev_t;
+
+typedef struct l2arc_buf_hdr {
+	/* protected by arc_buf_hdr mutex */
+	l2arc_dev_t		*b_dev;		/* L2ARC device */
+	uint64_t		b_daddr;	/* disk address, offset byte */
+
 	list_node_t		b_l2node;
+} l2arc_buf_hdr_t;
+
+struct arc_buf_hdr {
+	/* protected by hash lock */
+	dva_t			b_dva;
+	uint64_t		b_birth;
+
+	arc_buf_contents_t	b_type;
+	arc_buf_hdr_t		*b_hash_next;
+	arc_flags_t		b_flags;
+
+	/*
+	 * This field stores the size of the data buffer after
+	 * compression, and is set in the arc's zio completion handlers.
+	 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
+	 *
+	 * While the block pointers can store up to 32MB in their psize
+	 * field, we can only store up to 32MB minus 512B. This is due
+	 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
+	 * a field of zeros represents 512B in the bp). We can't use a
+	 * bias of 1 since we need to reserve a psize of zero here to
+	 * represent holes and embedded blocks.
+	 *
+	 * This isn't a problem in practice, since the maximum size of a
+	 * buffer is limited to 16MB, so we never need to store 32MB in
+	 * this field. Even in the upstream illumos code base, the
+	 * maximum size of a buffer is limited to 16MB.
+	 */
+	uint16_t		b_psize;
+
+	/*
+	 * This field stores the size of the data buffer before
+	 * compression, and cannot change once set. It is in units
+	 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
+	 */
+	uint16_t		b_lsize;	/* immutable */
+	uint64_t		b_spa;		/* immutable */
+
+	/* L2ARC fields. Undefined when not in L2ARC. */
+	l2arc_buf_hdr_t		b_l2hdr;
+	/* L1ARC fields. Undefined when in l2arc_only state */
+	l1arc_buf_hdr_t		b_l1hdr;
 };
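Both size fields are kept in 512-byte (SPA_MINBLOCKSIZE) units, so a 128KB
logical block that compressed to 4KB on disk carries b_lsize == 256 and
b_psize == 8. The HDR_GET_PSIZE()/HDR_GET_LSIZE() accessors used throughout
this file presumably just shift the fields back into bytes; hypothetical
equivalents:

    /* Hypothetical equivalents of the byte-unit accessors (assumption). */
    #define	HDR_GET_PSIZE_SKETCH(hdr)	\
            ((uint64_t)(hdr)->b_psize << SPA_MINBLOCKSHIFT)
    #define	HDR_GET_LSIZE_SKETCH(hdr)	\
            ((uint64_t)(hdr)->b_lsize << SPA_MINBLOCKSHIFT)
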
 
-static arc_buf_t *arc_eviction_list;
-static kmutex_t arc_eviction_mtx;
-static arc_buf_hdr_t arc_eviction_hdr;
-static void arc_get_data_buf(arc_buf_t *buf);
-static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
-static int arc_evict_needed(arc_buf_contents_t type);
-static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
-#ifdef illumos
-static void arc_buf_watch(arc_buf_t *buf);
-#endif /* illumos */
+#if defined(__FreeBSD__) && defined(_KERNEL)
+static int
+sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int err;
 
-static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
+	val = arc_meta_limit;
+	err = sysctl_handle_64(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
 
+	if (val <= 0 || val > arc_c_max)
+		return (EINVAL);
+
+	arc_meta_limit = val;
+	return (0);
+}
+
+static int
+sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int err;
+
+	val = zfs_arc_max;
+	err = sysctl_handle_64(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (zfs_arc_max == 0) {
+		/* Loader tunable so blindly set */
+		zfs_arc_max = val;
+		return (0);
+	}
+
+	if (val < arc_abs_min || val > kmem_size())
+		return (EINVAL);
+	if (val < arc_c_min)
+		return (EINVAL);
+	if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
+		return (EINVAL);
+
+	arc_c_max = val;
+
+	arc_c = arc_c_max;
+	arc_p = (arc_c >> 1);
+
+	if (zfs_arc_meta_limit == 0) {
+		/* limit meta-data to 1/4 of the arc capacity */
+		arc_meta_limit = arc_c_max / 4;
+	}
+
+	/* if kmem_flags are set, let's try to use less memory */
+	if (kmem_debugging())
+		arc_c = arc_c / 2;
+
+	zfs_arc_max = arc_c;
+
+	return (0);
+}
+
+static int
+sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int err;
+
+	val = zfs_arc_min;
+	err = sysctl_handle_64(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (zfs_arc_min == 0) {
+		/* Loader tunable so blindly set */
+		zfs_arc_min = val;
+		return (0);
+	}
+
+	if (val < arc_abs_min || val > arc_c_max)
+		return (EINVAL);
+
+	arc_c_min = val;
+
+	if (zfs_arc_meta_min == 0)
+		arc_meta_min = arc_c_min / 2;
+
+	if (arc_c < arc_c_min)
+		arc_c = arc_c_min;
+
+	zfs_arc_min = arc_c_min;
+
+	return (0);
+}
+#endif
+
 #define	GHOST_STATE(state)	\
 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
 	(state) == arc_l2c_only)
 
-/*
- * Private ARC flags.  These flags are private ARC only flags that will show up
- * in b_flags in the arc_hdr_buf_t.  Some flags are publicly declared, and can
- * be passed in as arc_flags in things like arc_read.  However, these flags
- * should never be passed and should only be set by ARC code.  When adding new
- * public flags, make sure not to smash the private ones.
- */
+#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
+#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
+#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
+#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
+#define	HDR_COMPRESSION_ENABLED(hdr)	\
+	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
 
-#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
-#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
-#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
-#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
-#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
-#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
-#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
-#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
-#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
-#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
+#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
+#define	HDR_L2_READING(hdr)	\
+	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
+	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
+#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
+#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
+#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
+#define	HDR_SHARED_DATA(hdr)	((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
 
-#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
-#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
-#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
-#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
-#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
-#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
-#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
-#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
-#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
-				    (hdr)->b_l2hdr != NULL)
-#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
-#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
-#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
+#define	HDR_ISTYPE_METADATA(hdr)	\
+	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
+#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))
 
+#define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
+#define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
+
+/* For storing compression mode in b_flags */
+#define	HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)
+
+#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
+	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
+#define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
+	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp))
+
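Because the compression mode is packed into a few bits of b_flags, reads and
writes of it go through the BF32 bitfield helpers rather than simple flag
masks. A small usage sketch (arc_hdr_set_compress() is defined later in this
file):

    uint64_t size;

    /* Record LZ4 as the on-disk compression for this header. */
    arc_hdr_set_compress(hdr, ZIO_COMPRESS_LZ4);

    /* When compression is on, the physical copy is only psize bytes. */
    if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
            size = HDR_GET_PSIZE(hdr);
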
+#define	ARC_BUF_LAST(buf)	((buf)->b_next == NULL)
+
 /*
  * Other sizes
  */
 
-#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
-#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
+#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
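Because b_l1hdr is the last member of arc_buf_hdr_t, the offsetof() above
yields the size of the common-plus-L2 prefix, letting L2-only headers come
from a smaller kmem cache. A hedged sketch of how arc_hdr_realloc() can demote
a header by copying just that prefix (the real function copies fields
individually):

    arc_buf_hdr_t *nhdr = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);

    /* Copy everything up to, but not including, b_l1hdr. */
    bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
    arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
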
 
 /*
  * Hash table routines
@@ -703,68 +1257,74 @@
     &l2arc_norw, 0, "no reads during writes");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
-    &ARC_anon.arcs_size, 0, "size of anonymous state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
-    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
-    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
+    &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
+    &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
+    &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of anonymous state");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
-    &ARC_mru.arcs_size, 0, "size of mru state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
-    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
-    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
+    &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
+    &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
+    &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mru state");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
-    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
-    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
+    &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
     "size of metadata in mru ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
-    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
     "size of data in mru ghost state");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
-    &ARC_mfu.arcs_size, 0, "size of mfu state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
-    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
-    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
+    &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
+    &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
+    &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mfu state");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
-    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
-    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
+    &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
     "size of metadata in mfu ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
-    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
     "size of data in mfu ghost state");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
-    &ARC_l2c_only.arcs_size, 0, "size of mru state");
+    &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state");
 
 /*
  * L2ARC Internals
  */
-typedef struct l2arc_dev {
+struct l2arc_dev {
 	vdev_t			*l2ad_vdev;	/* vdev */
 	spa_t			*l2ad_spa;	/* spa */
 	uint64_t		l2ad_hand;	/* next write location */
 	uint64_t		l2ad_start;	/* first addr on device */
 	uint64_t		l2ad_end;	/* last addr on device */
-	uint64_t		l2ad_evict;	/* last addr eviction reached */
 	boolean_t		l2ad_first;	/* first sweep through */
 	boolean_t		l2ad_writing;	/* currently writing */
-	list_t			*l2ad_buflist;	/* buffer list */
+	kmutex_t		l2ad_mtx;	/* lock for buffer list */
+	list_t			l2ad_buflist;	/* buffer list */
 	list_node_t		l2ad_node;	/* device list node */
-} l2arc_dev_t;
+	refcount_t		l2ad_alloc;	/* allocated bytes */
+};
 
 static list_t L2ARC_dev_list;			/* device list */
 static list_t *l2arc_dev_list;			/* device list pointer */
 static kmutex_t l2arc_dev_mtx;			/* device list mutex */
 static l2arc_dev_t *l2arc_dev_last;		/* last device used */
-static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
 static list_t L2ARC_free_on_write;		/* free after write buf list */
 static list_t *l2arc_free_on_write;		/* free after write list ptr */
 static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
@@ -771,12 +1331,11 @@
 static uint64_t l2arc_ndev;			/* number of devices */
 
 typedef struct l2arc_read_callback {
-	arc_buf_t		*l2rcb_buf;		/* read buffer */
-	spa_t			*l2rcb_spa;		/* spa */
+	arc_buf_hdr_t		*l2rcb_hdr;		/* read buffer */
 	blkptr_t		l2rcb_bp;		/* original blkptr */
-	zbookmark_t		l2rcb_zb;		/* original bookmark */
+	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
 	int			l2rcb_flags;		/* original flags */
-	enum zio_compress	l2rcb_compress;		/* applied compress */
+	void			*l2rcb_data;		/* temporary buffer */
 } l2arc_read_callback_t;
 
 typedef struct l2arc_write_callback {
@@ -784,23 +1343,11 @@
 	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
 } l2arc_write_callback_t;
 
-struct l2arc_buf_hdr {
-	/* protected by arc_buf_hdr  mutex */
-	l2arc_dev_t		*b_dev;		/* L2ARC device */
-	uint64_t		b_daddr;	/* disk address, offset byte */
-	/* compression applied to buffer data */
-	enum zio_compress	b_compress;
-	/* real alloc'd buffer size depending on b_compress applied */
-	int			b_asize;
-	/* temporary buffer holder for in-flight compressed data */
-	void			*b_tmp_cdata;
-};
-
 typedef struct l2arc_data_free {
 	/* protected by l2arc_free_on_write_mtx */
 	void		*l2df_data;
 	size_t		l2df_size;
-	void		(*l2df_func)(void *, size_t);
+	arc_buf_contents_t l2df_type;
 	list_node_t	l2df_list_node;
 } l2arc_data_free_t;
 
@@ -808,15 +1355,36 @@
 static kcondvar_t l2arc_feed_thr_cv;
 static uint8_t l2arc_thread_exit;
 
-static void l2arc_read_done(zio_t *zio);
-static void l2arc_hdr_stat_add(void);
-static void l2arc_hdr_stat_remove(void);
+static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
+static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr);
+static void arc_hdr_alloc_pdata(arc_buf_hdr_t *);
+static void arc_access(arc_buf_hdr_t *, kmutex_t *);
+static boolean_t	arc_is_overflowing(void);
+static void arc_buf_watch(arc_buf_t *);
 
-static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
-static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
-    enum zio_compress c);
-static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
+static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
+static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
+static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
+static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 
+static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
+static void l2arc_read_done(zio_t *);
+
+static void
+l2arc_trim(const arc_buf_hdr_t *hdr)
+{
+	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+
+	ASSERT(HDR_HAS_L2HDR(hdr));
+	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
+
+	if (HDR_GET_PSIZE(hdr) != 0) {
+		trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
+		    HDR_GET_PSIZE(hdr), 0);
+	}
+}
+
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
@@ -834,15 +1402,14 @@
 	return (crc);
 }
 
-#define	BUF_EMPTY(buf)						\
-	((buf)->b_dva.dva_word[0] == 0 &&			\
-	(buf)->b_dva.dva_word[1] == 0 &&			\
-	(buf)->b_birth == 0)
+#define	HDR_EMPTY(hdr)						\
+	((hdr)->b_dva.dva_word[0] == 0 &&			\
+	(hdr)->b_dva.dva_word[1] == 0)
 
-#define	BUF_EQUAL(spa, dva, birth, buf)				\
-	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
-	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
-	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
+#define	HDR_EQUAL(spa, dva, birth, hdr)				\
+	((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
+	((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
+	((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
 
 static void
 buf_discard_identity(arc_buf_hdr_t *hdr)
@@ -850,22 +1417,23 @@
 	hdr->b_dva.dva_word[0] = 0;
 	hdr->b_dva.dva_word[1] = 0;
 	hdr->b_birth = 0;
-	hdr->b_cksum0 = 0;
 }
 
 static arc_buf_hdr_t *
-buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
+buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 {
+	const dva_t *dva = BP_IDENTITY(bp);
+	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
-	arc_buf_hdr_t *buf;
+	arc_buf_hdr_t *hdr;
 
 	mutex_enter(hash_lock);
-	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
-	    buf = buf->b_hash_next) {
-		if (BUF_EQUAL(spa, dva, birth, buf)) {
+	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
+	    hdr = hdr->b_hash_next) {
+		if (HDR_EQUAL(spa, dva, birth, hdr)) {
 			*lockp = hash_lock;
-			return (buf);
+			return (hdr);
 		}
 	}
 	mutex_exit(hash_lock);
@@ -878,28 +1446,37 @@
  * equal to elem in the hash table, then the already existing element
  * will be returned and the new element will not be inserted.
  * Otherwise returns NULL.
+ * If lockp == NULL, the caller is assumed to already hold the hash lock.
  */
 static arc_buf_hdr_t *
-buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
+buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
 {
-	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
-	arc_buf_hdr_t *fbuf;
+	arc_buf_hdr_t *fhdr;
 	uint32_t i;
 
-	ASSERT(!HDR_IN_HASH_TABLE(buf));
-	*lockp = hash_lock;
-	mutex_enter(hash_lock);
-	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
-	    fbuf = fbuf->b_hash_next, i++) {
-		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
-			return (fbuf);
+	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
+	ASSERT(hdr->b_birth != 0);
+	ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+	if (lockp != NULL) {
+		*lockp = hash_lock;
+		mutex_enter(hash_lock);
+	} else {
+		ASSERT(MUTEX_HELD(hash_lock));
 	}
 
-	buf->b_hash_next = buf_hash_table.ht_table[idx];
-	buf_hash_table.ht_table[idx] = buf;
-	buf->b_flags |= ARC_IN_HASH_TABLE;
+	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
+	    fhdr = fhdr->b_hash_next, i++) {
+		if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
+			return (fhdr);
+	}
 
+	hdr->b_hash_next = buf_hash_table.ht_table[idx];
+	buf_hash_table.ht_table[idx] = hdr;
+	arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
+
 	/* collect some hash table performance data */
 	if (i > 0) {
 		ARCSTAT_BUMP(arcstat_hash_collisions);
@@ -916,22 +1493,22 @@
 }
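The new NULL-lockp convention lets a caller that already holds the bucket's
hash lock insert without recursing on the mutex. A hedged usage sketch:

    /* hash_lock for this bucket is already held, e.g. from buf_hash_find(). */
    ASSERT(MUTEX_HELD(hash_lock));
    VERIFY3P(buf_hash_insert(hdr, NULL), ==, NULL);
    mutex_exit(hash_lock);
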
 
 static void
-buf_hash_remove(arc_buf_hdr_t *buf)
+buf_hash_remove(arc_buf_hdr_t *hdr)
 {
-	arc_buf_hdr_t *fbuf, **bufp;
-	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+	arc_buf_hdr_t *fhdr, **hdrp;
+	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 
 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
-	ASSERT(HDR_IN_HASH_TABLE(buf));
+	ASSERT(HDR_IN_HASH_TABLE(hdr));
 
-	bufp = &buf_hash_table.ht_table[idx];
-	while ((fbuf = *bufp) != buf) {
-		ASSERT(fbuf != NULL);
-		bufp = &fbuf->b_hash_next;
+	hdrp = &buf_hash_table.ht_table[idx];
+	while ((fhdr = *hdrp) != hdr) {
+		ASSERT3P(fhdr, !=, NULL);
+		hdrp = &fhdr->b_hash_next;
 	}
-	*bufp = buf->b_hash_next;
-	buf->b_hash_next = NULL;
-	buf->b_flags &= ~ARC_IN_HASH_TABLE;
+	*hdrp = hdr->b_hash_next;
+	hdr->b_hash_next = NULL;
+	arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 
 	/* collect some hash table performance data */
 	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
@@ -944,7 +1521,8 @@
 /*
  * Global data structures and functions for the buf kmem cache.
  */
-static kmem_cache_t *hdr_cache;
+static kmem_cache_t *hdr_full_cache;
+static kmem_cache_t *hdr_l2only_cache;
 static kmem_cache_t *buf_cache;
 
 static void
@@ -956,7 +1534,8 @@
 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
 	for (i = 0; i < BUF_LOCKS; i++)
 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
-	kmem_cache_destroy(hdr_cache);
+	kmem_cache_destroy(hdr_full_cache);
+	kmem_cache_destroy(hdr_l2only_cache);
 	kmem_cache_destroy(buf_cache);
 }
 
@@ -966,15 +1545,16 @@
  */
 /* ARGSUSED */
 static int
-hdr_cons(void *vbuf, void *unused, int kmflag)
+hdr_full_cons(void *vbuf, void *unused, int kmflag)
 {
-	arc_buf_hdr_t *buf = vbuf;
+	arc_buf_hdr_t *hdr = vbuf;
 
-	bzero(buf, sizeof (arc_buf_hdr_t));
-	refcount_create(&buf->b_refcnt);
-	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
-	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
-	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+	bzero(hdr, HDR_FULL_SIZE);
+	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
+	refcount_create(&hdr->b_l1hdr.b_refcnt);
+	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
+	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 
 	return (0);
 }
@@ -981,6 +1561,18 @@
 
 /* ARGSUSED */
 static int
+hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
+{
+	arc_buf_hdr_t *hdr = vbuf;
+
+	bzero(hdr, HDR_L2ONLY_SIZE);
+	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
+
+	return (0);
+}
+
+/* ARGSUSED */
+static int
 buf_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_t *buf = vbuf;
@@ -998,19 +1590,30 @@
  */
 /* ARGSUSED */
 static void
-hdr_dest(void *vbuf, void *unused)
+hdr_full_dest(void *vbuf, void *unused)
 {
-	arc_buf_hdr_t *buf = vbuf;
+	arc_buf_hdr_t *hdr = vbuf;
 
-	ASSERT(BUF_EMPTY(buf));
-	refcount_destroy(&buf->b_refcnt);
-	cv_destroy(&buf->b_cv);
-	mutex_destroy(&buf->b_freeze_lock);
-	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+	ASSERT(HDR_EMPTY(hdr));
+	cv_destroy(&hdr->b_l1hdr.b_cv);
+	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
+	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 }
 
 /* ARGSUSED */
 static void
+hdr_l2only_dest(void *vbuf, void *unused)
+{
+	arc_buf_hdr_t *hdr = vbuf;
+
+	ASSERT(HDR_EMPTY(hdr));
+	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
+}
+
+/* ARGSUSED */
+static void
 buf_dest(void *vbuf, void *unused)
 {
 	arc_buf_t *buf = vbuf;
@@ -1032,7 +1635,7 @@
 	 * which is after we do arc_fini().
 	 */
 	if (!arc_dead)
-		cv_signal(&arc_reclaim_thr_cv);
+		cv_signal(&arc_reclaim_thread_cv);
 }
 
 static void
@@ -1044,10 +1647,11 @@
 
 	/*
 	 * The hash table is big enough to fill all of physical memory
-	 * with an average 64K block size.  The table will take up
-	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
+	 * with an average block size of zfs_arc_average_blocksize (default 8K).
+	 * By default, the table will take up
+	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
 	 */
-	while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
+	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
 		hsize <<= 1;
 retry:
 	buf_hash_table.ht_mask = hsize - 1;
@@ -1059,8 +1663,11 @@
 		goto retry;
 	}
 
-	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
-	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
+	    0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
+	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
+	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
+	    NULL, NULL, 0);
 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 
@@ -1076,58 +1683,139 @@
 
 #define	ARC_MINTIME	(hz>>4) /* 62 ms */
 
+static inline boolean_t
+arc_buf_is_shared(arc_buf_t *buf)
+{
+	boolean_t shared = (buf->b_data != NULL &&
+	    buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
+	IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
+	return (shared);
+}
+
+static inline void
+arc_cksum_free(arc_buf_hdr_t *hdr)
+{
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+	if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+		kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
+		hdr->b_l1hdr.b_freeze_cksum = NULL;
+	}
+	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+}
+
 static void
 arc_cksum_verify(arc_buf_t *buf)
 {
+	arc_buf_hdr_t *hdr = buf->b_hdr;
 	zio_cksum_t zc;
 
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
-	mutex_enter(&buf->b_hdr->b_freeze_lock);
-	if (buf->b_hdr->b_freeze_cksum == NULL ||
-	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
-		mutex_exit(&buf->b_hdr->b_freeze_lock);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+
+	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+	if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
+		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 		return;
 	}
-	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
-	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
+	fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc);
+	if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
 		panic("buffer modified while frozen!");
-	mutex_exit(&buf->b_hdr->b_freeze_lock);
+	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 }
 
-static int
-arc_cksum_equal(arc_buf_t *buf)
+static boolean_t
+arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
 {
-	zio_cksum_t zc;
-	int equal;
+	enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
+	boolean_t valid_cksum;
 
-	mutex_enter(&buf->b_hdr->b_freeze_lock);
-	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
-	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
-	mutex_exit(&buf->b_hdr->b_freeze_lock);
+	ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
+	VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
 
-	return (equal);
+	/*
+	 * We rely on the blkptr's checksum to determine if the block
+	 * is valid or not. When compressed arc is enabled, the l2arc
+	 * writes the block to the l2arc just as it appears in the pool.
+	 * This allows us to use the blkptr's checksum to validate the
+	 * data that we just read off of the l2arc without having to store
+	 * a separate checksum in the arc_buf_hdr_t. However, if compressed
+	 * arc is disabled, then the data written to the l2arc is always
+	 * uncompressed and won't match the block as it exists in the main
+	 * pool. When this is the case, we must first compress it if it is
+	 * compressed on the main pool before we can validate the checksum.
+	 */
+	if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
+		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+		uint64_t lsize = HDR_GET_LSIZE(hdr);
+		uint64_t csize;
+
+		void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr));
+		csize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+		ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
+		if (csize < HDR_GET_PSIZE(hdr)) {
+			/*
+			 * Compressed blocks are always a multiple of the
+			 * smallest ashift in the pool. Ideally, we would
+			 * like to round up the csize to the next
+			 * spa_min_ashift but that value may have changed
+			 * since the block was last written. Instead,
+			 * we rely on the fact that the hdr's psize
+			 * was set to the psize of the block when it was
+			 * last written. We set the csize to that value
+			 * and zero out any part that should not contain
+			 * data.
+			 */
+			bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize);
+			csize = HDR_GET_PSIZE(hdr);
+		}
+		zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL);
+	}
+
+	/*
+	 * Block pointers always store the checksum for the logical data.
+	 * If the block pointer has the gang bit set, then the checksum
+	 * it represents is for the reconstituted data and not for an
+	 * individual gang member. The zio pipeline, however, must be able to
+	 * determine the checksum of each of the gang constituents so it
+	 * treats the checksum comparison differently than what we need
+	 * for l2arc blocks. This prevents us from using the
+	 * zio_checksum_error() interface directly. Instead we must call the
+	 * zio_checksum_error_impl() so that we can ensure the checksum is
+	 * generated using the correct checksum algorithm and accounts for the
+	 * logical I/O size and not just a gang fragment.
+	 */
+	valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
+	    BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size,
+	    zio->io_offset, NULL) == 0);
+	zio_pop_transforms(zio);
+	return (valid_cksum);
 }
 
 static void
-arc_cksum_compute(arc_buf_t *buf, boolean_t force)
+arc_cksum_compute(arc_buf_t *buf)
 {
-	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+
+	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
-	mutex_enter(&buf->b_hdr->b_freeze_lock);
-	if (buf->b_hdr->b_freeze_cksum != NULL) {
-		mutex_exit(&buf->b_hdr->b_freeze_lock);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+	if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 		return;
 	}
-	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
-	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
-	    buf->b_hdr->b_freeze_cksum);
-	mutex_exit(&buf->b_hdr->b_freeze_lock);
+	hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+	    KM_SLEEP);
+	fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL,
+	    hdr->b_l1hdr.b_freeze_cksum);
+	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 #ifdef illumos
 	arc_buf_watch(buf);
-#endif /* illumos */
+#endif
 }
 
 #ifdef illumos
@@ -1166,7 +1854,7 @@
 		procctl_t ctl;
 		ctl.cmd = PCWATCH;
 		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
-		ctl.prwatch.pr_size = buf->b_hdr->b_size;
+		ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr);
 		ctl.prwatch.pr_wflags = WA_WRITE;
 		result = write(arc_procfd, &ctl, sizeof (ctl));
 		ASSERT3U(result, ==, sizeof (ctl));
@@ -1175,225 +1863,529 @@
 }
 #endif /* illumos */
 
+static arc_buf_contents_t
+arc_buf_type(arc_buf_hdr_t *hdr)
+{
+	arc_buf_contents_t type;
+	if (HDR_ISTYPE_METADATA(hdr)) {
+		type = ARC_BUFC_METADATA;
+	} else {
+		type = ARC_BUFC_DATA;
+	}
+	VERIFY3U(hdr->b_type, ==, type);
+	return (type);
+}
+
+static uint32_t
+arc_bufc_to_flags(arc_buf_contents_t type)
+{
+	switch (type) {
+	case ARC_BUFC_DATA:
+		/* metadata field is 0 if buffer contains normal data */
+		return (0);
+	case ARC_BUFC_METADATA:
+		return (ARC_FLAG_BUFC_METADATA);
+	default:
+		break;
+	}
+	panic("undefined ARC buffer type!");
+	return ((uint32_t)-1);
+}
+
 void
 arc_buf_thaw(arc_buf_t *buf)
 {
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+
 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
-		if (buf->b_hdr->b_state != arc_anon)
+		if (hdr->b_l1hdr.b_state != arc_anon)
 			panic("modifying non-anon buffer!");
-		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
+		if (HDR_IO_IN_PROGRESS(hdr))
 			panic("modifying buffer while i/o in progress!");
 		arc_cksum_verify(buf);
 	}
 
-	mutex_enter(&buf->b_hdr->b_freeze_lock);
-	if (buf->b_hdr->b_freeze_cksum != NULL) {
-		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
-		buf->b_hdr->b_freeze_cksum = NULL;
-	}
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	arc_cksum_free(hdr);
 
+	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+#ifdef ZFS_DEBUG
 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
-		if (buf->b_hdr->b_thawed)
-			kmem_free(buf->b_hdr->b_thawed, 1);
-		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
+		if (hdr->b_l1hdr.b_thawed != NULL)
+			kmem_free(hdr->b_l1hdr.b_thawed, 1);
+		hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
 	}
+#endif
 
-	mutex_exit(&buf->b_hdr->b_freeze_lock);
+	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 
 #ifdef illumos
 	arc_buf_unwatch(buf);
-#endif /* illumos */
+#endif
 }
 
 void
 arc_buf_freeze(arc_buf_t *buf)
 {
+	arc_buf_hdr_t *hdr = buf->b_hdr;
 	kmutex_t *hash_lock;
 
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
-	hash_lock = HDR_LOCK(buf->b_hdr);
+	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 
-	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
-	    buf->b_hdr->b_state == arc_anon);
-	arc_cksum_compute(buf, B_FALSE);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL ||
+	    hdr->b_l1hdr.b_state == arc_anon);
+	arc_cksum_compute(buf);
 	mutex_exit(hash_lock);
 
 }
 
+/*
+ * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
+ * the following functions should be used to ensure that the flags are
+ * updated in a thread-safe way. When manipulating the flags either
+ * the hash_lock must be held or the hdr must be undiscoverable. This
+ * ensures that we're not racing with any other threads when updating
+ * the flags.
+ */
+static inline void
+arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+	hdr->b_flags |= flags;
+}
+
+static inline void
+arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+	hdr->b_flags &= ~flags;
+}
+
+/*
+ * Setting the compression bits in the arc_buf_hdr_t's b_flags is
+ * done in a special way since we have to clear and set bits
+ * at the same time. Consumers that wish to set the compression bits
+ * must use this function to ensure that the flags are updated in
+ * thread-safe manner.
+ */
 static void
-get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
+arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
 {
-	uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
 
-	if (ab->b_type == ARC_BUFC_METADATA)
-		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
-	else {
-		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
-		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
+	/*
+	 * Holes and embedded blocks will always have a psize = 0 so
+	 * we ignore the compression of the blkptr and set the
+	 * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
+	 * Holes and embedded blocks remain anonymous so we don't
+	 * want to uncompress them. Mark them as uncompressed.
+	 */
+	if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
+		arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
+		ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
+		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+	} else {
+		arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+		HDR_SET_COMPRESS(hdr, cmp);
+		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
+		ASSERT(HDR_COMPRESSION_ENABLED(hdr));
 	}
+}
 
-	*list = &state->arcs_lists[buf_hashid];
-	*lock = ARCS_LOCK(state, buf_hashid);
+static int
+arc_decompress(arc_buf_t *buf)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
+	int error;
+
+	if (arc_buf_is_shared(buf)) {
+		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+	} else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
+		/*
+		 * The arc_buf_hdr_t is either not compressed or is
+		 * associated with an embedded block or a hole in which
+		 * case they remain anonymous.
+		 */
+		IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 ||
+		    HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr));
+		ASSERT(!HDR_SHARED_DATA(hdr));
+		bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr));
+	} else {
+		ASSERT(!HDR_SHARED_DATA(hdr));
+		ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
+		error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+		    hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr),
+		    HDR_GET_LSIZE(hdr));
+		if (error != 0) {
+			zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d",
+			    hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr),
+			    HDR_GET_LSIZE(hdr));
+			return (SET_ERROR(EIO));
+		}
+	}
+	if (bswap != DMU_BSWAP_NUMFUNCS) {
+		ASSERT(!HDR_SHARED_DATA(hdr));
+		ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
+		dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
+	}
+	arc_cksum_compute(buf);
+	return (0);
 }
 
+/*
+ * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t.
+ */
+static uint64_t
+arc_hdr_size(arc_buf_hdr_t *hdr)
+{
+	uint64_t size;
 
+	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+	    HDR_GET_PSIZE(hdr) > 0) {
+		size = HDR_GET_PSIZE(hdr);
+	} else {
+		ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
+		size = HDR_GET_LSIZE(hdr);
+	}
+	return (size);
+}
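
To make the psize/lsize rule concrete, here is a small stand-alone model of the same decision (hdr_stored_size and the enum are hypothetical names, not part of the diff): a header holding compressed data occupies its physical size; an uncompressed header, or one describing a hole or embedded block (psize == 0), occupies its logical size.

#include <stdio.h>
#include <stdint.h>

enum { COMPRESS_OFF = 0, COMPRESS_LZ4 = 1 };	/* illustrative only */

static uint64_t
hdr_stored_size(int compress, uint64_t psize, uint64_t lsize)
{
	if (compress != COMPRESS_OFF && psize > 0)
		return (psize);		/* compressed: physical size */
	return (lsize);			/* uncompressed: logical size */
}

int
main(void)
{
	/* A 128K logical block compressed to 32K occupies 32K. */
	printf("%llu\n",
	    (unsigned long long)hdr_stored_size(COMPRESS_LZ4, 32768, 131072));
	/* A hole/embedded block (psize == 0) occupies its logical size. */
	printf("%llu\n",
	    (unsigned long long)hdr_stored_size(COMPRESS_LZ4, 0, 131072));
	return (0);
}
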
+
+/*
+ * Increment the amount of evictable space in the arc_state_t's refcount.
+ * We account for the space used by the hdr and the arc buf individually
+ * so that we can add and remove them from the refcount individually.
+ */
 static void
-add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
 {
-	ASSERT(MUTEX_HELD(hash_lock));
+	arc_buf_contents_t type = arc_buf_type(hdr);
+	uint64_t lsize = HDR_GET_LSIZE(hdr);
 
-	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
-	    (ab->b_state != arc_anon)) {
-		uint64_t delta = ab->b_size * ab->b_datacnt;
-		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
-		list_t *list;
-		kmutex_t *lock;
+	ASSERT(HDR_HAS_L1HDR(hdr));
 
-		get_buf_info(ab, ab->b_state, &list, &lock);
-		ASSERT(!MUTEX_HELD(lock));
-		mutex_enter(lock);
-		ASSERT(list_link_active(&ab->b_arc_node));
-		list_remove(list, ab);
-		if (GHOST_STATE(ab->b_state)) {
-			ASSERT0(ab->b_datacnt);
-			ASSERT3P(ab->b_buf, ==, NULL);
-			delta = ab->b_size;
+	if (GHOST_STATE(state)) {
+		ASSERT0(hdr->b_l1hdr.b_bufcnt);
+		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+		ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+		(void) refcount_add_many(&state->arcs_esize[type], lsize, hdr);
+		return;
+	}
+
+	ASSERT(!GHOST_STATE(state));
+	if (hdr->b_l1hdr.b_pdata != NULL) {
+		(void) refcount_add_many(&state->arcs_esize[type],
+		    arc_hdr_size(hdr), hdr);
+	}
+	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+	    buf = buf->b_next) {
+		if (arc_buf_is_shared(buf)) {
+			ASSERT(ARC_BUF_LAST(buf));
+			continue;
 		}
-		ASSERT(delta > 0);
-		ASSERT3U(*size, >=, delta);
-		atomic_add_64(size, -delta);
-		mutex_exit(lock);
+		(void) refcount_add_many(&state->arcs_esize[type], lsize, buf);
+	}
+}
+
+/*
+ * Decrement the amount of evictable space in the arc_state_t's refcount.
+ * We account for the space used by the hdr and the arc buf individually
+ * so that we can add and remove them from the refcount individually.
+ */
+static void
+arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
+{
+	arc_buf_contents_t type = arc_buf_type(hdr);
+	uint64_t lsize = HDR_GET_LSIZE(hdr);
+
+	ASSERT(HDR_HAS_L1HDR(hdr));
+
+	if (GHOST_STATE(state)) {
+		ASSERT0(hdr->b_l1hdr.b_bufcnt);
+		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+		ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+		(void) refcount_remove_many(&state->arcs_esize[type],
+		    lsize, hdr);
+		return;
+	}
+
+	ASSERT(!GHOST_STATE(state));
+	if (hdr->b_l1hdr.b_pdata != NULL) {
+		(void) refcount_remove_many(&state->arcs_esize[type],
+		    arc_hdr_size(hdr), hdr);
+	}
+	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+	    buf = buf->b_next) {
+		if (arc_buf_is_shared(buf)) {
+			ASSERT(ARC_BUF_LAST(buf));
+			continue;
+		}
+		(void) refcount_remove_many(&state->arcs_esize[type],
+		    lsize, buf);
+	}
+}
+
+/*
+ * Add a reference to this hdr indicating that someone is actively
+ * referencing that memory. When the refcount transitions from 0 to 1,
+ * we remove it from the respective arc_state_t list to indicate that
+ * it is not evictable.
+ */
+static void
+add_reference(arc_buf_hdr_t *hdr, void *tag)
+{
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	if (!MUTEX_HELD(HDR_LOCK(hdr))) {
+		ASSERT(hdr->b_l1hdr.b_state == arc_anon);
+		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+	}
+
+	arc_state_t *state = hdr->b_l1hdr.b_state;
+
+	if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
+	    (state != arc_anon)) {
+		/* We don't use the L2-only state list. */
+		if (state != arc_l2c_only) {
+			multilist_remove(&state->arcs_list[arc_buf_type(hdr)],
+			    hdr);
+			arc_evictable_space_decrement(hdr, state);
+		}
 		/* remove the prefetch flag if we get a reference */
-		if (ab->b_flags & ARC_PREFETCH)
-			ab->b_flags &= ~ARC_PREFETCH;
+		arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
 	}
 }
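
The transition logic in add_reference() and remove_reference() below reduces to one rule: only the 0 -> 1 hold and the 1 -> 0 release change evictability. A toy model, assuming hypothetical names (the real code uses refcount_add/refcount_remove and the multilists above):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct entry {
	int64_t	refcnt;
	bool	evictable;	/* models membership on the state list */
};

static void
entry_hold(struct entry *e)
{
	if (++e->refcnt == 1)
		e->evictable = false;	/* first hold pins the entry */
}

static void
entry_rele(struct entry *e)
{
	assert(e->refcnt > 0);
	if (--e->refcnt == 0)
		e->evictable = true;	/* last release unpins it */
}

int
main(void)
{
	struct entry e = { 0, true };

	entry_hold(&e);		/* 0 -> 1: removed from evict list */
	entry_hold(&e);		/* 1 -> 2: no list change */
	entry_rele(&e);
	entry_rele(&e);		/* 1 -> 0: eligible for eviction again */
	assert(e.evictable);
	return (0);
}
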
 
+/*
+ * Remove a reference from this hdr. When the reference transitions from
+ * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
+ * list making it eligible for eviction.
+ */
 static int
-remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
 {
 	int cnt;
-	arc_state_t *state = ab->b_state;
+	arc_state_t *state = hdr->b_l1hdr.b_state;
 
+	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
 	ASSERT(!GHOST_STATE(state));
 
-	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
+	/*
+	 * arc_l2c_only counts as a ghost state so we don't need to explicitly
+	 * check to prevent usage of the arc_l2c_only list.
+	 */
+	if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
 	    (state != arc_anon)) {
-		uint64_t *size = &state->arcs_lsize[ab->b_type];
-		list_t *list;
-		kmutex_t *lock;
-
-		get_buf_info(ab, state, &list, &lock);
-		ASSERT(!MUTEX_HELD(lock));
-		mutex_enter(lock);
-		ASSERT(!list_link_active(&ab->b_arc_node));
-		list_insert_head(list, ab);
-		ASSERT(ab->b_datacnt > 0);
-		atomic_add_64(size, ab->b_size * ab->b_datacnt);
-		mutex_exit(lock);
+		multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
+		ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
+		arc_evictable_space_increment(hdr, state);
 	}
 	return (cnt);
 }
 
 /*
- * Move the supplied buffer to the indicated state.  The mutex
+ * Move the supplied buffer to the indicated state. The hash lock
  * for the buffer must be held by the caller.
  */
 static void
-arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
+    kmutex_t *hash_lock)
 {
-	arc_state_t *old_state = ab->b_state;
-	int64_t refcnt = refcount_count(&ab->b_refcnt);
-	uint64_t from_delta, to_delta;
-	list_t *list;
-	kmutex_t *lock;
+	arc_state_t *old_state;
+	int64_t refcnt;
+	uint32_t bufcnt;
+	boolean_t update_old, update_new;
+	arc_buf_contents_t buftype = arc_buf_type(hdr);
 
+	/*
+	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
+	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
+	 * L1 hdr doesn't always exist when we change state to arc_anon before
+	 * destroying a header, in which case reallocating to add the L1 hdr is
+	 * pointless.
+	 */
+	if (HDR_HAS_L1HDR(hdr)) {
+		old_state = hdr->b_l1hdr.b_state;
+		refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
+		bufcnt = hdr->b_l1hdr.b_bufcnt;
+		update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL);
+	} else {
+		old_state = arc_l2c_only;
+		refcnt = 0;
+		bufcnt = 0;
+		update_old = B_FALSE;
+	}
+	update_new = update_old;
+
 	ASSERT(MUTEX_HELD(hash_lock));
-	ASSERT(new_state != old_state);
-	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
-	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
-	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
+	ASSERT3P(new_state, !=, old_state);
+	ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
+	ASSERT(old_state != arc_anon || bufcnt <= 1);
 
-	from_delta = to_delta = ab->b_datacnt * ab->b_size;
-
 	/*
 	 * If this buffer is evictable, transfer it from the
 	 * old state list to the new state list.
 	 */
 	if (refcnt == 0) {
-		if (old_state != arc_anon) {
-			int use_mutex;
-			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
+		if (old_state != arc_anon && old_state != arc_l2c_only) {
+			ASSERT(HDR_HAS_L1HDR(hdr));
+			multilist_remove(&old_state->arcs_list[buftype], hdr);
 
-			get_buf_info(ab, old_state, &list, &lock);
-			use_mutex = !MUTEX_HELD(lock);
-			if (use_mutex)
-				mutex_enter(lock);
+			if (GHOST_STATE(old_state)) {
+				ASSERT0(bufcnt);
+				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+				update_old = B_TRUE;
+			}
+			arc_evictable_space_decrement(hdr, old_state);
+		}
+		if (new_state != arc_anon && new_state != arc_l2c_only) {
 
-			ASSERT(list_link_active(&ab->b_arc_node));
-			list_remove(list, ab);
-
 			/*
-			 * If prefetching out of the ghost cache,
-			 * we will have a non-zero datacnt.
+			 * An L1 header always exists here, since if we're
+			 * moving to some L1-cached state (i.e. not l2c_only or
+			 * anonymous), we realloc the header to add an L1hdr
+			 * beforehand.
 			 */
-			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
-				/* ghost elements have a ghost size */
-				ASSERT(ab->b_buf == NULL);
-				from_delta = ab->b_size;
+			ASSERT(HDR_HAS_L1HDR(hdr));
+			multilist_insert(&new_state->arcs_list[buftype], hdr);
+
+			if (GHOST_STATE(new_state)) {
+				ASSERT0(bufcnt);
+				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+				update_new = B_TRUE;
 			}
-			ASSERT3U(*size, >=, from_delta);
-			atomic_add_64(size, -from_delta);
-
-			if (use_mutex)
-				mutex_exit(lock);
+			arc_evictable_space_increment(hdr, new_state);
 		}
-		if (new_state != arc_anon) {
-			int use_mutex;
-			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
+	}
 
-			get_buf_info(ab, new_state, &list, &lock);
-			use_mutex = !MUTEX_HELD(lock);
-			if (use_mutex)
-				mutex_enter(lock);
+	ASSERT(!HDR_EMPTY(hdr));
+	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
+		buf_hash_remove(hdr);
 
-			list_insert_head(list, ab);
+	/* adjust state sizes (ignore arc_l2c_only) */
 
-			/* ghost elements have a ghost size */
-			if (GHOST_STATE(new_state)) {
-				ASSERT(ab->b_datacnt == 0);
-				ASSERT(ab->b_buf == NULL);
-				to_delta = ab->b_size;
+	if (update_new && new_state != arc_l2c_only) {
+		ASSERT(HDR_HAS_L1HDR(hdr));
+		if (GHOST_STATE(new_state)) {
+			ASSERT0(bufcnt);
+
+			/*
+			 * When moving a header to a ghost state, we first
+			 * remove all arc buffers. Thus, we'll have a
+			 * bufcnt of zero, and no arc buffer to use for
+			 * the reference. As a result, we use the arc
+			 * header pointer for the reference.
+			 */
+			(void) refcount_add_many(&new_state->arcs_size,
+			    HDR_GET_LSIZE(hdr), hdr);
+			ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+		} else {
+			uint32_t buffers = 0;
+
+			/*
+			 * Each individual buffer holds a unique reference,
+			 * thus we must remove each of these references one
+			 * at a time.
+			 */
+			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+			    buf = buf->b_next) {
+				ASSERT3U(bufcnt, !=, 0);
+				buffers++;
+
+				/*
+				 * When the arc_buf_t is sharing the data
+				 * block with the hdr, the owner of the
+				 * reference belongs to the hdr. Only
+				 * add to the refcount if the arc_buf_t is
+				 * not shared.
+				 */
+				if (arc_buf_is_shared(buf)) {
+					ASSERT(ARC_BUF_LAST(buf));
+					continue;
+				}
+
+				(void) refcount_add_many(&new_state->arcs_size,
+				    HDR_GET_LSIZE(hdr), buf);
 			}
-			atomic_add_64(size, to_delta);
+			ASSERT3U(bufcnt, ==, buffers);
 
-			if (use_mutex)
-				mutex_exit(lock);
+			if (hdr->b_l1hdr.b_pdata != NULL) {
+				(void) refcount_add_many(&new_state->arcs_size,
+				    arc_hdr_size(hdr), hdr);
+			} else {
+				ASSERT(GHOST_STATE(old_state));
+			}
 		}
 	}
 
-	ASSERT(!BUF_EMPTY(ab));
-	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
-		buf_hash_remove(ab);
+	if (update_old && old_state != arc_l2c_only) {
+		ASSERT(HDR_HAS_L1HDR(hdr));
+		if (GHOST_STATE(old_state)) {
+			ASSERT0(bufcnt);
 
-	/* adjust state sizes */
-	if (to_delta)
-		atomic_add_64(&new_state->arcs_size, to_delta);
-	if (from_delta) {
-		ASSERT3U(old_state->arcs_size, >=, from_delta);
-		atomic_add_64(&old_state->arcs_size, -from_delta);
+			/*
+			 * When moving a header off of a ghost state,
+			 * the header will not contain any arc buffers.
+			 * We use the arc header pointer for the reference
+			 * which is exactly what we did when we put the
+			 * header on the ghost state.
+			 */
+
+			(void) refcount_remove_many(&old_state->arcs_size,
+			    HDR_GET_LSIZE(hdr), hdr);
+			ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+		} else {
+			uint32_t buffers = 0;
+
+			/*
+			 * Each individual buffer holds a unique reference,
+			 * thus we must remove each of these references one
+			 * at a time.
+			 */
+			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+			    buf = buf->b_next) {
+				ASSERT3U(bufcnt, !=, 0);
+				buffers++;
+
+				/*
+				 * When the arc_buf_t is sharing the data
+				 * block with the hdr, the owner of the
+				 * reference belongs to the hdr. Only
+				 * add to the refcount if the arc_buf_t is
+				 * not shared.
+				 */
+				if (arc_buf_is_shared(buf)) {
+					ASSERT(ARC_BUF_LAST(buf));
+					continue;
+				}
+
+				(void) refcount_remove_many(
+				    &old_state->arcs_size, HDR_GET_LSIZE(hdr),
+				    buf);
+			}
+			ASSERT3U(bufcnt, ==, buffers);
+			ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+			(void) refcount_remove_many(
+			    &old_state->arcs_size, arc_hdr_size(hdr), hdr);
+		}
 	}
-	ab->b_state = new_state;
 
-	/* adjust l2arc hdr stats */
-	if (new_state == arc_l2c_only)
-		l2arc_hdr_stat_add();
-	else if (old_state == arc_l2c_only)
-		l2arc_hdr_stat_remove();
+	if (HDR_HAS_L1HDR(hdr))
+		hdr->b_l1hdr.b_state = new_state;
+
+	/*
+	 * L2 headers should never be on the L2 state list since they don't
+	 * have L1 headers allocated.
+	 */
+	ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+	    multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
 }
 
 void
@@ -1405,6 +2397,9 @@
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, space);
 		break;
+	case ARC_SPACE_META:
+		ARCSTAT_INCR(arcstat_metadata_size, space);
+		break;
 	case ARC_SPACE_OTHER:
 		ARCSTAT_INCR(arcstat_other_size, space);
 		break;
@@ -1416,7 +2411,9 @@
 		break;
 	}
 
-	atomic_add_64(&arc_meta_used, space);
+	if (type != ARC_SPACE_DATA)
+		ARCSTAT_INCR(arcstat_meta_used, space);
+
 	atomic_add_64(&arc_size, space);
 }
 
@@ -1429,6 +2426,9 @@
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, -space);
 		break;
+	case ARC_SPACE_META:
+		ARCSTAT_INCR(arcstat_metadata_size, -space);
+		break;
 	case ARC_SPACE_OTHER:
 		ARCSTAT_INCR(arcstat_other_size, -space);
 		break;
@@ -1440,58 +2440,96 @@
 		break;
 	}
 
-	ASSERT(arc_meta_used >= space);
-	if (arc_meta_max < arc_meta_used)
-		arc_meta_max = arc_meta_used;
-	atomic_add_64(&arc_meta_used, -space);
+	if (type != ARC_SPACE_DATA) {
+		ASSERT(arc_meta_used >= space);
+		if (arc_meta_max < arc_meta_used)
+			arc_meta_max = arc_meta_used;
+		ARCSTAT_INCR(arcstat_meta_used, -space);
+	}
+
 	ASSERT(arc_size >= space);
 	atomic_add_64(&arc_size, -space);
 }
 
-void *
-arc_data_buf_alloc(uint64_t size)
+/*
+ * Allocate an initial buffer for this hdr; subsequent buffers will
+ * use arc_buf_clone().
+ */
+static arc_buf_t *
+arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag)
 {
-	if (arc_evict_needed(ARC_BUFC_DATA))
-		cv_signal(&arc_reclaim_thr_cv);
-	atomic_add_64(&arc_size, size);
-	return (zio_data_buf_alloc(size));
-}
+	arc_buf_t *buf;
 
-void
-arc_data_buf_free(void *buf, uint64_t size)
-{
-	zio_data_buf_free(buf, size);
-	ASSERT(arc_size >= size);
-	atomic_add_64(&arc_size, -size);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
+	VERIFY(hdr->b_type == ARC_BUFC_DATA ||
+	    hdr->b_type == ARC_BUFC_METADATA);
+
+	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+	ASSERT0(hdr->b_l1hdr.b_bufcnt);
+
+	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+	buf->b_hdr = hdr;
+	buf->b_data = NULL;
+	buf->b_next = NULL;
+
+	add_reference(hdr, tag);
+
+	/*
+	 * We're about to change the hdr's b_flags. We must either
+	 * hold the hash_lock or be undiscoverable.
+	 */
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+	/*
+	 * If the hdr's data can be shared (no byteswapping, hdr is
+	 * uncompressed, and the hdr's data is not currently being written
+	 * to the L2ARC) then we share the data buffer and set the
+	 * appropriate bit in the hdr's b_flags to indicate the hdr is
+	 * sharing its
+	 * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to
+	 * store the buf's data.
+	 */
+	if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
+	    HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) {
+		buf->b_data = hdr->b_l1hdr.b_pdata;
+		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
+	} else {
+		buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+		ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
+		arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+	}
+	VERIFY3P(buf->b_data, !=, NULL);
+
+	hdr->b_l1hdr.b_buf = buf;
+	hdr->b_l1hdr.b_bufcnt += 1;
+
+	return (buf);
 }
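
The three conditions gating the sharing fast path (byteswap func unset, compression off, no L2ARC write in flight) can be read as a single predicate. A hedged sketch, with can_share_hdr_data as a hypothetical helper that is not in the diff:

#include <stdbool.h>

/*
 * A buf may borrow the hdr's b_pdata only when the bytes in b_pdata
 * are exactly the bytes the consumer expects: no byteswap pending,
 * data stored uncompressed, and no L2ARC write still reading b_pdata.
 */
static bool
can_share_hdr_data(bool byteswap_pending, bool hdr_compressed,
    bool l2_write_in_progress)
{
	return (!byteswap_pending && !hdr_compressed &&
	    !l2_write_in_progress);
}
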
 
-arc_buf_t *
-arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
+/*
+ * Used when allocating additional buffers.
+ */
+static arc_buf_t *
+arc_buf_clone(arc_buf_t *from)
 {
-	arc_buf_hdr_t *hdr;
 	arc_buf_t *buf;
+	arc_buf_hdr_t *hdr = from->b_hdr;
+	uint64_t size = HDR_GET_LSIZE(hdr);
 
-	ASSERT3U(size, >, 0);
-	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
-	ASSERT(BUF_EMPTY(hdr));
-	hdr->b_size = size;
-	hdr->b_type = type;
-	hdr->b_spa = spa_load_guid(spa);
-	hdr->b_state = arc_anon;
-	hdr->b_arc_access = 0;
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
+
 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 	buf->b_hdr = hdr;
 	buf->b_data = NULL;
-	buf->b_efunc = NULL;
-	buf->b_private = NULL;
-	buf->b_next = NULL;
-	hdr->b_buf = buf;
-	arc_get_data_buf(buf);
-	hdr->b_datacnt = 1;
-	hdr->b_flags = 0;
-	ASSERT(refcount_is_zero(&hdr->b_refcnt));
-	(void) refcount_add(&hdr->b_refcnt, tag);
+	buf->b_next = hdr->b_l1hdr.b_buf;
+	hdr->b_l1hdr.b_buf = buf;
+	buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+	bcopy(from->b_data, buf->b_data, size);
+	hdr->b_l1hdr.b_bufcnt += 1;
 
+	ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
 	return (buf);
 }
 
@@ -1508,7 +2546,7 @@
 {
 	arc_buf_t *buf;
 
-	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
+	buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
 
 	atomic_add_64(&arc_loaned_bytes, size);
 	return (buf);
@@ -1522,11 +2560,12 @@
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
-	ASSERT(buf->b_data != NULL);
-	(void) refcount_add(&hdr->b_refcnt, tag);
-	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
+	ASSERT3P(buf->b_data, !=, NULL);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
+	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
 
-	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
+	atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr));
 }
 
 /* Detach an arc_buf from a dbuf (tag) */
@@ -1533,173 +2572,213 @@
 void
 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
 {
-	arc_buf_hdr_t *hdr;
+	arc_buf_hdr_t *hdr = buf->b_hdr;
 
-	ASSERT(buf->b_data != NULL);
-	hdr = buf->b_hdr;
-	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
-	(void) refcount_remove(&hdr->b_refcnt, tag);
-	buf->b_efunc = NULL;
-	buf->b_private = NULL;
+	ASSERT3P(buf->b_data, !=, NULL);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
+	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
 
-	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
+	atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr));
 }
 
-static arc_buf_t *
-arc_buf_clone(arc_buf_t *from)
+static void
+l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type)
 {
-	arc_buf_t *buf;
-	arc_buf_hdr_t *hdr = from->b_hdr;
-	uint64_t size = hdr->b_size;
+	l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
 
-	ASSERT(hdr->b_state != arc_anon);
+	df->l2df_data = data;
+	df->l2df_size = size;
+	df->l2df_type = type;
+	mutex_enter(&l2arc_free_on_write_mtx);
+	list_insert_head(l2arc_free_on_write, df);
+	mutex_exit(&l2arc_free_on_write_mtx);
+}
 
-	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
-	buf->b_hdr = hdr;
-	buf->b_data = NULL;
-	buf->b_efunc = NULL;
-	buf->b_private = NULL;
-	buf->b_next = hdr->b_buf;
-	hdr->b_buf = buf;
-	arc_get_data_buf(buf);
-	bcopy(from->b_data, buf->b_data, size);
+static void
+arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
+{
+	arc_state_t *state = hdr->b_l1hdr.b_state;
+	arc_buf_contents_t type = arc_buf_type(hdr);
+	uint64_t size = arc_hdr_size(hdr);
 
-	/*
-	 * This buffer already exists in the arc so create a duplicate
-	 * copy for the caller.  If the buffer is associated with user data
-	 * then track the size and number of duplicates.  These stats will be
-	 * updated as duplicate buffers are created and destroyed.
-	 */
-	if (hdr->b_type == ARC_BUFC_DATA) {
-		ARCSTAT_BUMP(arcstat_duplicate_buffers);
-		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
+	/* protected by hash lock, if in the hash table */
+	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+		ASSERT(state != arc_anon && state != arc_l2c_only);
+
+		(void) refcount_remove_many(&state->arcs_esize[type],
+		    size, hdr);
 	}
-	hdr->b_datacnt += 1;
-	return (buf);
+	(void) refcount_remove_many(&state->arcs_size, size, hdr);
+	if (type == ARC_BUFC_METADATA) {
+		arc_space_return(size, ARC_SPACE_META);
+	} else {
+		ASSERT(type == ARC_BUFC_DATA);
+		arc_space_return(size, ARC_SPACE_DATA);
+	}
+
+	l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type);
 }
 
-void
-arc_buf_add_ref(arc_buf_t *buf, void* tag)
+/*
+ * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
+ * data buffer, we transfer the refcount ownership to the hdr and update
+ * the appropriate kstats.
+ */
+static void
+arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
-	arc_buf_hdr_t *hdr;
-	kmutex_t *hash_lock;
+	arc_state_t *state = hdr->b_l1hdr.b_state;
 
+	ASSERT(!HDR_SHARED_DATA(hdr));
+	ASSERT(!arc_buf_is_shared(buf));
+	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
 	/*
-	 * Check to see if this buffer is evicted.  Callers
-	 * must verify b_data != NULL to know if the add_ref
-	 * was successful.
+	 * Start sharing the data buffer. We transfer the
+	 * refcount ownership to the hdr since it always owns
+	 * the refcount whenever an arc_buf_t is shared.
 	 */
-	mutex_enter(&buf->b_evict_lock);
-	if (buf->b_data == NULL) {
-		mutex_exit(&buf->b_evict_lock);
-		return;
-	}
-	hash_lock = HDR_LOCK(buf->b_hdr);
-	mutex_enter(hash_lock);
-	hdr = buf->b_hdr;
-	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
-	mutex_exit(&buf->b_evict_lock);
+	refcount_transfer_ownership(&state->arcs_size, buf, hdr);
+	hdr->b_l1hdr.b_pdata = buf->b_data;
+	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
 
-	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
-	add_reference(hdr, hash_lock, tag);
-	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
-	arc_access(hdr, hash_lock);
-	mutex_exit(hash_lock);
-	ARCSTAT_BUMP(arcstat_hits);
-	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
-	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
-	    data, metadata, hits);
+	/*
+	 * Since we've transferred ownership to the hdr we need
+	 * to increment its compressed and uncompressed kstats and
+	 * decrement the overhead size.
+	 */
+	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
+	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+	ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr));
 }
 
-/*
- * Free the arc data buffer.  If it is an l2arc write in progress,
- * the buffer is placed on l2arc_free_on_write to be freed later.
- */
 static void
-arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
+arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
-	arc_buf_hdr_t *hdr = buf->b_hdr;
+	arc_state_t *state = hdr->b_l1hdr.b_state;
 
-	if (HDR_L2_WRITING(hdr)) {
-		l2arc_data_free_t *df;
-		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
-		df->l2df_data = buf->b_data;
-		df->l2df_size = hdr->b_size;
-		df->l2df_func = free_func;
-		mutex_enter(&l2arc_free_on_write_mtx);
-		list_insert_head(l2arc_free_on_write, df);
-		mutex_exit(&l2arc_free_on_write_mtx);
-		ARCSTAT_BUMP(arcstat_l2_free_on_write);
-	} else {
-		free_func(buf->b_data, hdr->b_size);
-	}
+	ASSERT(HDR_SHARED_DATA(hdr));
+	ASSERT(arc_buf_is_shared(buf));
+	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+	/*
+	 * We are no longer sharing this buffer so we need
+	 * to transfer its ownership to the rightful owner.
+	 */
+	refcount_transfer_ownership(&state->arcs_size, hdr, buf);
+	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+	hdr->b_l1hdr.b_pdata = NULL;
+
+	/*
+	 * Since the buffer is no longer shared between
+	 * the arc buf and the hdr, count it as overhead.
+	 */
+	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
+	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
+	ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
 }
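
arc_share_buf() and arc_unshare_buf() are mirror images: the byte count tracked in arcs_size never changes, only the tag naming the owner does. A toy model of that tag swap, assuming simplified hypothetical types (the real refcount_t tracks many holders, not one):

#include <assert.h>
#include <stdint.h>

struct tracked_count {
	uint64_t	bytes;
	const void	*owner;		/* tag identifying the holder */
};

/*
 * Models refcount_transfer_ownership(): the size is untouched and only
 * the tag moves, so per-owner leak accounting still balances.
 */
static void
transfer_ownership(struct tracked_count *tc, const void *from,
    const void *to)
{
	assert(tc->owner == from);
	tc->owner = to;
}
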
 
+/*
+ * Free up buf->b_data and, if 'remove' is set, pull the
+ * arc_buf_t off of the arc_buf_hdr_t's list and free it.
+ */
 static void
-arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
+arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove)
 {
 	arc_buf_t **bufp;
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	uint64_t size = HDR_GET_LSIZE(hdr);
+	boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf);
 
-	/* free up data associated with the buf */
-	if (buf->b_data) {
-		arc_state_t *state = buf->b_hdr->b_state;
-		uint64_t size = buf->b_hdr->b_size;
-		arc_buf_contents_t type = buf->b_hdr->b_type;
+	/*
+	 * Free up the data associated with the buf but only
+	 * if we're not sharing this with the hdr. If we are sharing
+	 * it with the hdr, then hdr will have performed the allocation
+	 * so allow it to do the free.
+	 */
+	if (buf->b_data != NULL) {
+		/*
+		 * We're about to change the hdr's b_flags. We must either
+		 * hold the hash_lock or be undiscoverable.
+		 */
+		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
 
 		arc_cksum_verify(buf);
 #ifdef illumos
 		arc_buf_unwatch(buf);
-#endif /* illumos */
+#endif
 
-		if (!recycle) {
-			if (type == ARC_BUFC_METADATA) {
-				arc_buf_data_free(buf, zio_buf_free);
-				arc_space_return(size, ARC_SPACE_DATA);
-			} else {
-				ASSERT(type == ARC_BUFC_DATA);
-				arc_buf_data_free(buf, zio_data_buf_free);
-				ARCSTAT_INCR(arcstat_data_size, -size);
-				atomic_add_64(&arc_size, -size);
-			}
+		if (destroyed_buf_is_shared) {
+			ASSERT(ARC_BUF_LAST(buf));
+			ASSERT(HDR_SHARED_DATA(hdr));
+			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+		} else {
+			arc_free_data_buf(hdr, buf->b_data, size, buf);
+			ARCSTAT_INCR(arcstat_overhead_size, -size);
 		}
-		if (list_link_active(&buf->b_hdr->b_arc_node)) {
-			uint64_t *cnt = &state->arcs_lsize[type];
-
-			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
-			ASSERT(state != arc_anon);
-
-			ASSERT3U(*cnt, >=, size);
-			atomic_add_64(cnt, -size);
-		}
-		ASSERT3U(state->arcs_size, >=, size);
-		atomic_add_64(&state->arcs_size, -size);
 		buf->b_data = NULL;
 
-		/*
-		 * If we're destroying a duplicate buffer make sure
-		 * that the appropriate statistics are updated.
-		 */
-		if (buf->b_hdr->b_datacnt > 1 &&
-		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
-			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
-			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
-		}
-		ASSERT(buf->b_hdr->b_datacnt > 0);
-		buf->b_hdr->b_datacnt -= 1;
+		ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+		hdr->b_l1hdr.b_bufcnt -= 1;
 	}
 
 	/* only remove the buf if requested */
-	if (!all)
+	if (!remove)
 		return;
 
 	/* remove the buf from the hdr list */
-	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
-		continue;
-	*bufp = buf->b_next;
+	arc_buf_t *lastbuf = NULL;
+	bufp = &hdr->b_l1hdr.b_buf;
+	while (*bufp != NULL) {
+		if (*bufp == buf)
+			*bufp = buf->b_next;
+
+		/*
+		 * If we've removed a buffer in the middle of
+		 * the list then update the lastbuf and update
+		 * bufp.
+		 */
+		if (*bufp != NULL) {
+			lastbuf = *bufp;
+			bufp = &(*bufp)->b_next;
+		}
+	}
 	buf->b_next = NULL;
+	ASSERT3P(lastbuf, !=, buf);
 
-	ASSERT(buf->b_efunc == NULL);
+	/*
+	 * If the current arc_buf_t is sharing its data
+	 * buffer with the hdr, then reassign the hdr's
+	 * b_pdata to share it with the new buffer at the end
+	 * of the list. The shared buffer is always the last one
+	 * on the hdr's buffer list.
+	 */
+	if (destroyed_buf_is_shared && lastbuf != NULL) {
+		ASSERT(ARC_BUF_LAST(buf));
+		ASSERT(ARC_BUF_LAST(lastbuf));
+		VERIFY(!arc_buf_is_shared(lastbuf));
 
+		ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+		arc_hdr_free_pdata(hdr);
+
+		/*
+		 * We must setup a new shared block between the
+		 * last buffer and the hdr. The data would have
+		 * been allocated by the arc buf so we need to transfer
+		 * ownership to the hdr since it's now being shared.
+		 */
+		arc_share_buf(hdr, lastbuf);
+	} else if (HDR_SHARED_DATA(hdr)) {
+		ASSERT(arc_buf_is_shared(lastbuf));
+	}
+
+	if (hdr->b_l1hdr.b_bufcnt == 0)
+		arc_cksum_free(hdr);
+
 	/* clean up the buf */
 	buf->b_hdr = NULL;
 	kmem_cache_free(buf_cache, buf);
@@ -1706,668 +2785,1026 @@
 }
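
The removal loop in arc_buf_destroy_impl() is the classic pointer-to-pointer list walk, extended to remember the last surviving node so the shared b_pdata can be re-homed. A stand-alone sketch of the same traversal, with hypothetical names:

#include <assert.h>
#include <stddef.h>

struct node { struct node *next; };

static struct node *
unlink_and_find_last(struct node **headp, struct node *victim)
{
	struct node **bufp = headp;
	struct node *last = NULL;

	while (*bufp != NULL) {
		if (*bufp == victim)
			*bufp = victim->next;	/* splice victim out */
		if (*bufp != NULL) {
			last = *bufp;		/* track the survivor */
			bufp = &(*bufp)->next;
		}
	}
	victim->next = NULL;
	return (last);
}

int
main(void)
{
	struct node c = { NULL }, b = { &c }, a = { &b };
	struct node *head = &a;

	/* Removing the middle node leaves a -> c; c is the last node. */
	assert(unlink_and_find_last(&head, &b) == &c);
	assert(head == &a && a.next == &c);
	return (0);
}
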
 
 static void
-arc_hdr_destroy(arc_buf_hdr_t *hdr)
+arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr)
 {
-	ASSERT(refcount_is_zero(&hdr->b_refcnt));
-	ASSERT3P(hdr->b_state, ==, arc_anon);
-	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT(!HDR_SHARED_DATA(hdr));
 
-	if (l2hdr != NULL) {
-		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
+	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+	hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr);
+	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+
+	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
+	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+}
+
+static void
+arc_hdr_free_pdata(arc_buf_hdr_t *hdr)
+{
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+
+	/*
+	 * If the hdr is currently being written to the l2arc then
+	 * we defer freeing the data by adding it to the l2arc_free_on_write
+	 * list. The l2arc will free the data once it's finished
+	 * writing it to the l2arc device.
+	 */
+	if (HDR_L2_WRITING(hdr)) {
+		arc_hdr_free_on_write(hdr);
+		ARCSTAT_BUMP(arcstat_l2_free_on_write);
+	} else {
+		arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata,
+		    arc_hdr_size(hdr), hdr);
+	}
+	hdr->b_l1hdr.b_pdata = NULL;
+	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+
+	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
+	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
+}
+
+static arc_buf_hdr_t *
+arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
+    enum zio_compress compress, arc_buf_contents_t type)
+{
+	arc_buf_hdr_t *hdr;
+
+	ASSERT3U(lsize, >, 0);
+	VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
+
+	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
+	ASSERT(HDR_EMPTY(hdr));
+	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+	ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
+	HDR_SET_PSIZE(hdr, psize);
+	HDR_SET_LSIZE(hdr, lsize);
+	hdr->b_spa = spa;
+	hdr->b_type = type;
+	hdr->b_flags = 0;
+	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
+	arc_hdr_set_compress(hdr, compress);
+
+	hdr->b_l1hdr.b_state = arc_anon;
+	hdr->b_l1hdr.b_arc_access = 0;
+	hdr->b_l1hdr.b_bufcnt = 0;
+	hdr->b_l1hdr.b_buf = NULL;
+
+	/*
+	 * Allocate the hdr's buffer. This will contain either
+	 * the compressed or uncompressed data depending on the block
+	 * it references and compressed arc enablement.
+	 */
+	arc_hdr_alloc_pdata(hdr);
+	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+
+	return (hdr);
+}
+
+/*
+ * Transition between the two allocation states for the arc_buf_hdr struct.
+ * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
+ * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
+ * version is used when a cache buffer is only in the L2ARC in order to reduce
+ * memory usage.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
+{
+	ASSERT(HDR_HAS_L2HDR(hdr));
+
+	arc_buf_hdr_t *nhdr;
+	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+
+	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
+	    (old == hdr_l2only_cache && new == hdr_full_cache));
+
+	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
+
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+	buf_hash_remove(hdr);
+
+	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+
+	if (new == hdr_full_cache) {
+		arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
 		/*
-		 * To prevent arc_free() and l2arc_evict() from
-		 * attempting to free the same buffer at the same time,
-		 * a FREE_IN_PROGRESS flag is given to arc_free() to
-		 * give it priority.  l2arc_evict() can't destroy this
-		 * header while we are waiting on l2arc_buflist_mtx.
-		 *
-		 * The hdr may be removed from l2ad_buflist before we
-		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+		 * arc_access and arc_change_state need to be aware that a
+		 * header has just come out of L2ARC, so we set its state to
+		 * l2c_only even though it's about to change.
 		 */
-		if (!buflist_held) {
-			mutex_enter(&l2arc_buflist_mtx);
-			l2hdr = hdr->b_l2hdr;
-		}
+		nhdr->b_l1hdr.b_state = arc_l2c_only;
 
-		if (l2hdr != NULL) {
-			trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
-			    hdr->b_size, 0);
-			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
-			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
-			ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
-			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
-			if (hdr->b_state == arc_l2c_only)
-				l2arc_hdr_stat_remove();
-			hdr->b_l2hdr = NULL;
-		}
+		/* Verify previous threads set to NULL before freeing */
+		ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL);
+	} else {
+		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+		ASSERT0(hdr->b_l1hdr.b_bufcnt);
+		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 
-		if (!buflist_held)
-			mutex_exit(&l2arc_buflist_mtx);
-	}
+		/*
+		 * If we've reached here, we must have been called from
+		 * arc_evict_hdr(), as such we should have already been
+		 * removed from any ghost list we were previously on
+		 * (which protects us from racing with arc_evict_state),
+		 * thus no locking is needed during this check.
+		 */
+		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 
-	if (!BUF_EMPTY(hdr)) {
-		ASSERT(!HDR_IN_HASH_TABLE(hdr));
-		buf_discard_identity(hdr);
-	}
-	while (hdr->b_buf) {
-		arc_buf_t *buf = hdr->b_buf;
+		/*
+		 * A buffer must not be moved into the arc_l2c_only
+		 * state if it's not finished being written out to the
+		 * l2arc device. Otherwise, the b_l1hdr.b_pdata field
+		 * might be accessed even though it has been removed.
+		 */
+		VERIFY(!HDR_L2_WRITING(hdr));
+		VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL);
 
-		if (buf->b_efunc) {
-			mutex_enter(&arc_eviction_mtx);
-			mutex_enter(&buf->b_evict_lock);
-			ASSERT(buf->b_hdr != NULL);
-			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
-			hdr->b_buf = buf->b_next;
-			buf->b_hdr = &arc_eviction_hdr;
-			buf->b_next = arc_eviction_list;
-			arc_eviction_list = buf;
-			mutex_exit(&buf->b_evict_lock);
-			mutex_exit(&arc_eviction_mtx);
-		} else {
-			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
+#ifdef ZFS_DEBUG
+		if (hdr->b_l1hdr.b_thawed != NULL) {
+			kmem_free(hdr->b_l1hdr.b_thawed, 1);
+			hdr->b_l1hdr.b_thawed = NULL;
 		}
+#endif
+
+		arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
 	}
-	if (hdr->b_freeze_cksum != NULL) {
-		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
-		hdr->b_freeze_cksum = NULL;
-	}
-	if (hdr->b_thawed) {
-		kmem_free(hdr->b_thawed, 1);
-		hdr->b_thawed = NULL;
-	}
+	/*
+	 * The header has been reallocated so we need to re-insert it into any
+	 * lists it was on.
+	 */
+	(void) buf_hash_insert(nhdr, NULL);
 
-	ASSERT(!list_link_active(&hdr->b_arc_node));
-	ASSERT3P(hdr->b_hash_next, ==, NULL);
-	ASSERT3P(hdr->b_acb, ==, NULL);
-	kmem_cache_free(hdr_cache, hdr);
+	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
+
+	mutex_enter(&dev->l2ad_mtx);
+
+	/*
+	 * We must place the realloc'ed header back into the list at
+	 * the same spot. Otherwise, if it's placed earlier in the list,
+	 * l2arc_write_buffers() could find it during the function's
+	 * write phase, and try to write it out to the l2arc.
+	 */
+	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
+	list_remove(&dev->l2ad_buflist, hdr);
+
+	mutex_exit(&dev->l2ad_mtx);
+
+	/*
+	 * Since we're using the pointer address as the tag when
+	 * incrementing and decrementing the l2ad_alloc refcount, we
+	 * must remove the old pointer (that we're about to destroy) and
+	 * add the new pointer to the refcount. Otherwise we'd remove
+	 * the wrong pointer address when calling arc_hdr_destroy() later.
+	 */
+
+	(void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
+	(void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr);
+
+	buf_discard_identity(hdr);
+	kmem_cache_free(old, hdr);
+
+	return (nhdr);
 }
 
-void
-arc_buf_free(arc_buf_t *buf, void *tag)
+/*
+ * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
+ * The buf is returned thawed since we expect the consumer to modify it.
+ */
+arc_buf_t *
+arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
 {
-	arc_buf_hdr_t *hdr = buf->b_hdr;
-	int hashed = hdr->b_state != arc_anon;
+	arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
+	    ZIO_COMPRESS_OFF, type);
+	ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
+	arc_buf_t *buf = arc_buf_alloc_impl(hdr, tag);
+	arc_buf_thaw(buf);
+	return (buf);
+}
 
-	ASSERT(buf->b_efunc == NULL);
-	ASSERT(buf->b_data != NULL);
+static void
+arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
+{
+	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
+	l2arc_dev_t *dev = l2hdr->b_dev;
+	uint64_t asize = arc_hdr_size(hdr);
 
-	if (hashed) {
-		kmutex_t *hash_lock = HDR_LOCK(hdr);
+	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
+	ASSERT(HDR_HAS_L2HDR(hdr));
 
-		mutex_enter(hash_lock);
-		hdr = buf->b_hdr;
-		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+	list_remove(&dev->l2ad_buflist, hdr);
 
-		(void) remove_reference(hdr, hash_lock, tag);
-		if (hdr->b_datacnt > 1) {
-			arc_buf_destroy(buf, FALSE, TRUE);
-		} else {
-			ASSERT(buf == hdr->b_buf);
-			ASSERT(buf->b_efunc == NULL);
-			hdr->b_flags |= ARC_BUF_AVAILABLE;
-		}
-		mutex_exit(hash_lock);
-	} else if (HDR_IO_IN_PROGRESS(hdr)) {
-		int destroy_hdr;
+	ARCSTAT_INCR(arcstat_l2_asize, -asize);
+	ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr));
+
+	vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
+
+	(void) refcount_remove_many(&dev->l2ad_alloc, asize, hdr);
+	arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
+}
+
+static void
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
+{
+	if (HDR_HAS_L1HDR(hdr)) {
+		ASSERT(hdr->b_l1hdr.b_buf == NULL ||
+		    hdr->b_l1hdr.b_bufcnt > 0);
+		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+	}
+	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+	ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+	if (!HDR_EMPTY(hdr))
+		buf_discard_identity(hdr);
+
+	if (HDR_HAS_L2HDR(hdr)) {
+		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
+
+		if (!buflist_held)
+			mutex_enter(&dev->l2ad_mtx);
+
 		/*
-		 * We are in the middle of an async write.  Don't destroy
-		 * this buffer unless the write completes before we finish
-		 * decrementing the reference count.
+		 * Even though we checked this conditional above, we
+		 * need to check this again now that we have the
+		 * l2ad_mtx. This is because we could be racing with
+		 * another thread calling l2arc_evict() which might have
+		 * destroyed this header's L2 portion as we were waiting
+		 * to acquire the l2ad_mtx. If that happens, we don't
+		 * want to re-destroy the header's L2 portion.
 		 */
-		mutex_enter(&arc_eviction_mtx);
-		(void) remove_reference(hdr, NULL, tag);
-		ASSERT(refcount_is_zero(&hdr->b_refcnt));
-		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
-		mutex_exit(&arc_eviction_mtx);
-		if (destroy_hdr)
-			arc_hdr_destroy(hdr);
+		if (HDR_HAS_L2HDR(hdr)) {
+			l2arc_trim(hdr);
+			arc_hdr_l2hdr_destroy(hdr);
+		}
+
+		if (!buflist_held)
+			mutex_exit(&dev->l2ad_mtx);
+	}
+
+	if (HDR_HAS_L1HDR(hdr)) {
+		arc_cksum_free(hdr);
+
+		while (hdr->b_l1hdr.b_buf != NULL)
+			arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE);
+
+#ifdef ZFS_DEBUG
+		if (hdr->b_l1hdr.b_thawed != NULL) {
+			kmem_free(hdr->b_l1hdr.b_thawed, 1);
+			hdr->b_l1hdr.b_thawed = NULL;
+		}
+#endif
+
+		if (hdr->b_l1hdr.b_pdata != NULL) {
+			arc_hdr_free_pdata(hdr);
+		}
+	}
+
+	ASSERT3P(hdr->b_hash_next, ==, NULL);
+	if (HDR_HAS_L1HDR(hdr)) {
+		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+		kmem_cache_free(hdr_full_cache, hdr);
 	} else {
-		if (remove_reference(hdr, NULL, tag) > 0)
-			arc_buf_destroy(buf, FALSE, TRUE);
-		else
-			arc_hdr_destroy(hdr);
+		kmem_cache_free(hdr_l2only_cache, hdr);
 	}
 }
 
-boolean_t
-arc_buf_remove_ref(arc_buf_t *buf, void* tag)
+void
+arc_buf_destroy(arc_buf_t *buf, void* tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	kmutex_t *hash_lock = HDR_LOCK(hdr);
-	boolean_t no_callback = (buf->b_efunc == NULL);
 
-	if (hdr->b_state == arc_anon) {
-		ASSERT(hdr->b_datacnt == 1);
-		arc_buf_free(buf, tag);
-		return (no_callback);
+	if (hdr->b_l1hdr.b_state == arc_anon) {
+		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+		VERIFY0(remove_reference(hdr, NULL, tag));
+		arc_hdr_destroy(hdr);
+		return;
 	}
 
 	mutex_enter(hash_lock);
-	hdr = buf->b_hdr;
+	ASSERT3P(hdr, ==, buf->b_hdr);
+	ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
-	ASSERT(hdr->b_state != arc_anon);
-	ASSERT(buf->b_data != NULL);
+	ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
+	ASSERT3P(buf->b_data, !=, NULL);
 
 	(void) remove_reference(hdr, hash_lock, tag);
-	if (hdr->b_datacnt > 1) {
-		if (no_callback)
-			arc_buf_destroy(buf, FALSE, TRUE);
-	} else if (no_callback) {
-		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
-		ASSERT(buf->b_efunc == NULL);
-		hdr->b_flags |= ARC_BUF_AVAILABLE;
-	}
-	ASSERT(no_callback || hdr->b_datacnt > 1 ||
-	    refcount_is_zero(&hdr->b_refcnt));
+	arc_buf_destroy_impl(buf, B_TRUE);
 	mutex_exit(hash_lock);
-	return (no_callback);
 }
 
-int
+int32_t
 arc_buf_size(arc_buf_t *buf)
 {
-	return (buf->b_hdr->b_size);
+	return (HDR_GET_LSIZE(buf->b_hdr));
 }
 
 /*
- * Called from the DMU to determine if the current buffer should be
- * evicted. In order to ensure proper locking, the eviction must be initiated
- * from the DMU. Return true if the buffer is associated with user data and
- * duplicate buffers still exist.
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on its state prior to entering this
+ * function. The following transitions are possible:
+ *
+ *    - arc_mru -> arc_mru_ghost
+ *    - arc_mfu -> arc_mfu_ghost
+ *    - arc_mru_ghost -> arc_l2c_only
+ *    - arc_mru_ghost -> deleted
+ *    - arc_mfu_ghost -> arc_l2c_only
+ *    - arc_mfu_ghost -> deleted
  */
-boolean_t
-arc_buf_eviction_needed(arc_buf_t *buf)
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
-	arc_buf_hdr_t *hdr;
-	boolean_t evict_needed = B_FALSE;
+	arc_state_t *evicted_state, *state;
+	int64_t bytes_evicted = 0;
 
-	if (zfs_disable_dup_eviction)
-		return (B_FALSE);
+	ASSERT(MUTEX_HELD(hash_lock));
+	ASSERT(HDR_HAS_L1HDR(hdr));
 
-	mutex_enter(&buf->b_evict_lock);
-	hdr = buf->b_hdr;
-	if (hdr == NULL) {
+	state = hdr->b_l1hdr.b_state;
+	if (GHOST_STATE(state)) {
+		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+
 		/*
-		 * We are in arc_do_user_evicts(); let that function
-		 * perform the eviction.
+		 * l2arc_write_buffers() relies on a header's L1 portion
+		 * (i.e. its b_pdata field) during its write phase.
+		 * Thus, we cannot push a header onto the arc_l2c_only
+		 * state (removing its L1 piece) until the header is
+		 * done being written to the l2arc.
 		 */
-		ASSERT(buf->b_data == NULL);
+		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
+			ARCSTAT_BUMP(arcstat_evict_l2_skip);
+			return (bytes_evicted);
+		}
+
+		ARCSTAT_BUMP(arcstat_deleted);
+		bytes_evicted += HDR_GET_LSIZE(hdr);
+
+		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
+
+		ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+		if (HDR_HAS_L2HDR(hdr)) {
+			ASSERT(hdr->b_l1hdr.b_pdata == NULL);
+			/*
+			 * This buffer is cached on the 2nd Level ARC;
+			 * don't destroy the header.
+			 */
+			arc_change_state(arc_l2c_only, hdr, hash_lock);
+			/*
+			 * dropping from L1+L2 cached to L2-only,
+			 * realloc to remove the L1 header.
+			 */
+			hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+			    hdr_l2only_cache);
+		} else {
+			ASSERT(hdr->b_l1hdr.b_pdata == NULL);
+			arc_change_state(arc_anon, hdr, hash_lock);
+			arc_hdr_destroy(hdr);
+		}
+		return (bytes_evicted);
+	}
+
+	ASSERT(state == arc_mru || state == arc_mfu);
+	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+	/* prefetch buffers have a minimum lifespan */
+	if (HDR_IO_IN_PROGRESS(hdr) ||
+	    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
+	    arc_min_prefetch_lifespan)) {
+		ARCSTAT_BUMP(arcstat_evict_skip);
+		return (bytes_evicted);
+	}
+
+	ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+	while (hdr->b_l1hdr.b_buf) {
+		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+		if (!mutex_tryenter(&buf->b_evict_lock)) {
+			ARCSTAT_BUMP(arcstat_mutex_miss);
+			break;
+		}
+		if (buf->b_data != NULL)
+			bytes_evicted += HDR_GET_LSIZE(hdr);
 		mutex_exit(&buf->b_evict_lock);
-		return (B_FALSE);
-	} else if (buf->b_data == NULL) {
+		arc_buf_destroy_impl(buf, B_TRUE);
+	}
+
+	if (HDR_HAS_L2HDR(hdr)) {
+		ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
+	} else {
+		if (l2arc_write_eligible(hdr->b_spa, hdr)) {
+			ARCSTAT_INCR(arcstat_evict_l2_eligible,
+			    HDR_GET_LSIZE(hdr));
+		} else {
+			ARCSTAT_INCR(arcstat_evict_l2_ineligible,
+			    HDR_GET_LSIZE(hdr));
+		}
+	}
+
+	if (hdr->b_l1hdr.b_bufcnt == 0) {
+		arc_cksum_free(hdr);
+
+		bytes_evicted += arc_hdr_size(hdr);
+
 		/*
-		 * We have already been added to the arc eviction list;
-		 * recommend eviction.
+		 * If this hdr is being evicted and has a compressed
+		 * buffer then we discard it here before we change states.
+		 * This ensures that the accounting is updated correctly
+		 * in arc_free_data_buf().
 		 */
-		ASSERT3P(hdr, ==, &arc_eviction_hdr);
-		mutex_exit(&buf->b_evict_lock);
-		return (B_TRUE);
+		arc_hdr_free_pdata(hdr);
+
+		arc_change_state(evicted_state, hdr, hash_lock);
+		ASSERT(HDR_IN_HASH_TABLE(hdr));
+		arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
+		DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
 	}
 
-	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
-		evict_needed = B_TRUE;
+	return (bytes_evicted);
+}
 
-	mutex_exit(&buf->b_evict_lock);
-	return (evict_needed);
+static uint64_t
+arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
+    uint64_t spa, int64_t bytes)
+{
+	multilist_sublist_t *mls;
+	uint64_t bytes_evicted = 0;
+	arc_buf_hdr_t *hdr;
+	kmutex_t *hash_lock;
+	int evict_count = 0;
+
+	ASSERT3P(marker, !=, NULL);
+	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+	mls = multilist_sublist_lock(ml, idx);
+
+	for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
+	    hdr = multilist_sublist_prev(mls, marker)) {
+		if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
+		    (evict_count >= zfs_arc_evict_batch_limit))
+			break;
+
+		/*
+		 * To keep our iteration location, move the marker
+		 * forward. Since we're not holding hdr's hash lock, we
+		 * must be very careful and not remove 'hdr' from the
+		 * sublist. Otherwise, other consumers might mistake the
+		 * 'hdr' as not being on a sublist when they call the
+		 * multilist_link_active() function (they all rely on
+		 * the hash lock protecting concurrent insertions and
+		 * removals). multilist_sublist_move_forward() was
+		 * specifically implemented to ensure this is the case
+		 * (only 'marker' will be removed and re-inserted).
+		 */
+		multilist_sublist_move_forward(mls, marker);
+
+		/*
+		 * The only case where the b_spa field should ever be
+		 * zero, is the marker headers inserted by
+		 * arc_evict_state(). It's possible for multiple threads
+		 * to be calling arc_evict_state() concurrently (e.g.
+		 * dsl_pool_close() and zio_inject_fault()), so we must
+		 * skip any markers we see from these other threads.
+		 */
+		if (hdr->b_spa == 0)
+			continue;
+
+		/* we're only interested in evicting buffers of a certain spa */
+		if (spa != 0 && hdr->b_spa != spa) {
+			ARCSTAT_BUMP(arcstat_evict_skip);
+			continue;
+		}
+
+		hash_lock = HDR_LOCK(hdr);
+
+		/*
+		 * We aren't calling this function from any code path
+		 * that would already be holding a hash lock, so we're
+		 * asserting on this assumption to be defensive in case
+		 * this ever changes. Without this check, it would be
+		 * possible to incorrectly increment arcstat_mutex_miss
+		 * below (e.g. if the code changed such that we called
+		 * this function with a hash lock held).
+		 */
+		ASSERT(!MUTEX_HELD(hash_lock));
+
+		if (mutex_tryenter(hash_lock)) {
+			uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
+			mutex_exit(hash_lock);
+
+			bytes_evicted += evicted;
+
+			/*
+			 * If evicted is zero, arc_evict_hdr() must have
+			 * decided to skip this header, don't increment
+			 * evict_count in this case.
+			 */
+			if (evicted != 0)
+				evict_count++;
+
+			/*
+			 * If arc_size isn't overflowing, signal any
+			 * threads that might happen to be waiting.
+			 *
+			 * For each header evicted, we wake up a single
+			 * thread. If we used cv_broadcast, we could
+			 * wake up "too many" threads causing arc_size
+			 * to significantly overflow arc_c; since
+			 * arc_get_data_buf() doesn't check for overflow
+			 * when it's woken up (it doesn't because it's
+			 * possible for the ARC to be overflowing while
+			 * full of un-evictable buffers, and the
+			 * function should proceed in this case).
+			 *
+			 * If threads are left sleeping, due to not
+			 * using cv_broadcast, they will be woken up
+			 * just before arc_reclaim_thread() sleeps.
+			 */
+			mutex_enter(&arc_reclaim_lock);
+			if (!arc_is_overflowing())
+				cv_signal(&arc_reclaim_waiters_cv);
+			mutex_exit(&arc_reclaim_lock);
+		} else {
+			ARCSTAT_BUMP(arcstat_mutex_miss);
+		}
+	}
+
+	multilist_sublist_unlock(mls);
+
+	return (bytes_evicted);
 }
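
arc_evict_state() below drives this per-sublist routine in a round-robin that begins at a random sublist and wraps, so no sublist is systematically scanned first. Isolated from the eviction bookkeeping, the scan order looks like this hedged sketch (names are hypothetical):

#include <stdio.h>
#include <stdlib.h>

/*
 * Visit every sublist exactly once, starting at a random index and
 * wrapping back to index 0 at the end.
 */
static void
scan_all_sublists(int num_sublists, void (*visit)(int))
{
	int idx = rand() % num_sublists;	/* random starting point */

	for (int i = 0; i < num_sublists; i++) {
		visit(idx);
		if (++idx >= num_sublists)
			idx = 0;		/* wrap to the beginning */
	}
}

static void
print_idx(int idx)
{
	printf("sublist %d\n", idx);
}

int
main(void)
{
	scan_all_sublists(8, print_idx);
	return (0);
}
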
 
 /*
- * Evict buffers from list until we've removed the specified number of
- * bytes.  Move the removed buffers to the appropriate evict state.
- * If the recycle flag is set, then attempt to "recycle" a buffer:
- * - look for a buffer to evict that is `bytes' long.
- * - return the data block from this buffer rather than freeing it.
- * This flag is used by callers that are trying to make space for a
- * new buffer in a full arc cache.
+ * Evict buffers from the given arc state, until we've removed the
+ * specified number of bytes. Move the removed buffers to the
+ * appropriate evict state.
  *
- * This function makes a "best effort".  It skips over any buffers
- * it can't get a hash_lock on, and so may not catch all candidates.
+ * This function makes a "best effort". It skips over any buffers
+ * it can't get a hash_lock on, and so, may not catch all candidates.
  * It may also return without evicting as much space as requested.
+ *
+ * If bytes is specified using the special value ARC_EVICT_ALL, this
+ * will evict all available (i.e. unlocked and evictable) buffers from
+ * the given arc state; which is used by arc_flush().
  */
-static void *
-arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
+static uint64_t
+arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
     arc_buf_contents_t type)
 {
-	arc_state_t *evicted_state;
-	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
-	int64_t bytes_remaining;
-	arc_buf_hdr_t *ab, *ab_prev = NULL;
-	list_t *evicted_list, *list, *evicted_list_start, *list_start;
-	kmutex_t *lock, *evicted_lock;
-	kmutex_t *hash_lock;
-	boolean_t have_lock;
-	void *stolen = NULL;
-	static int evict_metadata_offset, evict_data_offset;
-	int i, idx, offset, list_count, count;
+	uint64_t total_evicted = 0;
+	multilist_t *ml = &state->arcs_list[type];
+	int num_sublists;
+	arc_buf_hdr_t **markers;
 
-	ASSERT(state == arc_mru || state == arc_mfu);
+	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
 
-	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+	num_sublists = multilist_get_num_sublists(ml);
 
-	if (type == ARC_BUFC_METADATA) {
-		offset = 0;
-		list_count = ARC_BUFC_NUMMETADATALISTS;
-		list_start = &state->arcs_lists[0];
-		evicted_list_start = &evicted_state->arcs_lists[0];
-		idx = evict_metadata_offset;
-	} else {
-		offset = ARC_BUFC_NUMMETADATALISTS;
-		list_start = &state->arcs_lists[offset];
-		evicted_list_start = &evicted_state->arcs_lists[offset];
-		list_count = ARC_BUFC_NUMDATALISTS;
-		idx = evict_data_offset;
+	/*
+	 * If we've tried to evict from each sublist, made some
+	 * progress, but still have not hit the target number of bytes
+	 * to evict, we want to keep trying. The markers allow us to
+	 * pick up where we left off for each individual sublist, rather
+	 * than starting from the tail each time.
+	 */
+	markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
+	for (int i = 0; i < num_sublists; i++) {
+		markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
+
+		/*
+		 * A b_spa of 0 is used to indicate that this header is
+		 * a marker. This fact is used in arc_adjust_type() and
+		 * arc_evict_state_impl().
+		 */
+		markers[i]->b_spa = 0;
+
+		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+		multilist_sublist_insert_tail(mls, markers[i]);
+		multilist_sublist_unlock(mls);
 	}
-	bytes_remaining = evicted_state->arcs_lsize[type];
-	count = 0;
 
-evict_start:
-	list = &list_start[idx];
-	evicted_list = &evicted_list_start[idx];
-	lock = ARCS_LOCK(state, (offset + idx));
-	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
+	/*
+	 * While we haven't hit our target number of bytes to evict, or
+	 * we're evicting all available buffers.
+	 */
+	while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
+		/*
+		 * Start eviction using a randomly selected sublist;
+		 * this tries to balance eviction evenly across all
+		 * sublists. Always starting at the same sublist
+		 * (e.g. index 0) would cause evictions to favor certain
+		 * sublists over others.
+		 */
+		int sublist_idx = multilist_get_random_index(ml);
+		uint64_t scan_evicted = 0;
 
-	mutex_enter(lock);
-	mutex_enter(evicted_lock);
+		for (int i = 0; i < num_sublists; i++) {
+			uint64_t bytes_remaining;
+			uint64_t bytes_evicted;
 
-	for (ab = list_tail(list); ab; ab = ab_prev) {
-		ab_prev = list_prev(list, ab);
-		bytes_remaining -= (ab->b_size * ab->b_datacnt);
-		/* prefetch buffers have a minimum lifespan */
-		if (HDR_IO_IN_PROGRESS(ab) ||
-		    (spa && ab->b_spa != spa) ||
-		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
-		    ddi_get_lbolt() - ab->b_arc_access <
-		    arc_min_prefetch_lifespan)) {
-			skipped++;
-			continue;
+			if (bytes == ARC_EVICT_ALL)
+				bytes_remaining = ARC_EVICT_ALL;
+			else if (total_evicted < bytes)
+				bytes_remaining = bytes - total_evicted;
+			else
+				break;
+
+			bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
+			    markers[sublist_idx], spa, bytes_remaining);
+
+			scan_evicted += bytes_evicted;
+			total_evicted += bytes_evicted;
+
+			/* we've reached the end, wrap to the beginning */
+			if (++sublist_idx >= num_sublists)
+				sublist_idx = 0;
 		}
-		/* "lookahead" for better eviction candidate */
-		if (recycle && ab->b_size != bytes &&
-		    ab_prev && ab_prev->b_size == bytes)
-			continue;
-		hash_lock = HDR_LOCK(ab);
-		have_lock = MUTEX_HELD(hash_lock);
-		if (have_lock || mutex_tryenter(hash_lock)) {
-			ASSERT0(refcount_count(&ab->b_refcnt));
-			ASSERT(ab->b_datacnt > 0);
-			while (ab->b_buf) {
-				arc_buf_t *buf = ab->b_buf;
-				if (!mutex_tryenter(&buf->b_evict_lock)) {
-					missed += 1;
-					break;
-				}
-				if (buf->b_data) {
-					bytes_evicted += ab->b_size;
-					if (recycle && ab->b_type == type &&
-					    ab->b_size == bytes &&
-					    !HDR_L2_WRITING(ab)) {
-						stolen = buf->b_data;
-						recycle = FALSE;
-					}
-				}
-				if (buf->b_efunc) {
-					mutex_enter(&arc_eviction_mtx);
-					arc_buf_destroy(buf,
-					    buf->b_data == stolen, FALSE);
-					ab->b_buf = buf->b_next;
-					buf->b_hdr = &arc_eviction_hdr;
-					buf->b_next = arc_eviction_list;
-					arc_eviction_list = buf;
-					mutex_exit(&arc_eviction_mtx);
-					mutex_exit(&buf->b_evict_lock);
-				} else {
-					mutex_exit(&buf->b_evict_lock);
-					arc_buf_destroy(buf,
-					    buf->b_data == stolen, TRUE);
-				}
-			}
 
-			if (ab->b_l2hdr) {
-				ARCSTAT_INCR(arcstat_evict_l2_cached,
-				    ab->b_size);
-			} else {
-				if (l2arc_write_eligible(ab->b_spa, ab)) {
-					ARCSTAT_INCR(arcstat_evict_l2_eligible,
-					    ab->b_size);
-				} else {
-					ARCSTAT_INCR(
-					    arcstat_evict_l2_ineligible,
-					    ab->b_size);
-				}
+		/*
+		 * If we didn't evict anything during this scan, we have
+		 * no reason to believe we'll evict more during another
+		 * scan, so break the loop.
+		 */
+		if (scan_evicted == 0) {
+			/* This isn't possible; let's make that obvious */
+			ASSERT3S(bytes, !=, 0);
+
+			/*
+			 * When bytes is ARC_EVICT_ALL, the only way to
+			 * break the loop is when scan_evicted is zero.
+			 * In that case, we actually have evicted enough,
+			 * so we don't want to increment the kstat.
+			 */
+			if (bytes != ARC_EVICT_ALL) {
+				ASSERT3S(total_evicted, <, bytes);
+				ARCSTAT_BUMP(arcstat_evict_not_enough);
 			}
 
-			if (ab->b_datacnt == 0) {
-				arc_change_state(evicted_state, ab, hash_lock);
-				ASSERT(HDR_IN_HASH_TABLE(ab));
-				ab->b_flags |= ARC_IN_HASH_TABLE;
-				ab->b_flags &= ~ARC_BUF_AVAILABLE;
-				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
-			}
-			if (!have_lock)
-				mutex_exit(hash_lock);
-			if (bytes >= 0 && bytes_evicted >= bytes)
-				break;
-			if (bytes_remaining > 0) {
-				mutex_exit(evicted_lock);
-				mutex_exit(lock);
-				idx  = ((idx + 1) & (list_count - 1));
-				count++;
-				goto evict_start;
-			}
-		} else {
-			missed += 1;
+			break;
 		}
 	}
 
-	mutex_exit(evicted_lock);
-	mutex_exit(lock);
+	for (int i = 0; i < num_sublists; i++) {
+		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+		multilist_sublist_remove(mls, markers[i]);
+		multilist_sublist_unlock(mls);
 
-	idx  = ((idx + 1) & (list_count - 1));
-	count++;
-
-	if (bytes_evicted < bytes) {
-		if (count < list_count)
-			goto evict_start;
-		else
-			dprintf("only evicted %lld bytes from %x",
-			    (longlong_t)bytes_evicted, state);
+		kmem_cache_free(hdr_full_cache, markers[i]);
 	}
-	if (type == ARC_BUFC_METADATA)
-		evict_metadata_offset = idx;
-	else
-		evict_data_offset = idx;
+	kmem_free(markers, sizeof (*markers) * num_sublists);
 
-	if (skipped)
-		ARCSTAT_INCR(arcstat_evict_skip, skipped);
+	return (total_evicted);
+}
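
To make the randomized round-robin scan above concrete, here is a
minimal standalone sketch of just the index arithmetic; NUM_SUBLISTS
and rand() are illustrative stand-ins for the kernel's multilist API,
not the real interfaces.

#include <stdio.h>
#include <stdlib.h>

#define	NUM_SUBLISTS	4

int
main(void)
{
	/* random starting point, as multilist_get_random_index() provides */
	int sublist_idx = rand() % NUM_SUBLISTS;

	/* one full pass visits every sublist exactly once */
	for (int i = 0; i < NUM_SUBLISTS; i++) {
		printf("scanning sublist %d\n", sublist_idx);

		/* reached the end; wrap to the beginning */
		if (++sublist_idx >= NUM_SUBLISTS)
			sublist_idx = 0;
	}
	return (0);
}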
 
-	if (missed)
-		ARCSTAT_INCR(arcstat_mutex_miss, missed);
+/*
+ * Flush all "evictable" data of the given type from the arc state
+ * specified. This will not evict any "active" buffers (i.e. referenced).
+ *
+ * When 'retry' is set to B_FALSE, the function will make a single pass
+ * over the state and evict any buffers that it can. Since it doesn't
+ * continually retry the eviction, it might end up leaving some buffers
+ * in the ARC due to lock misses.
+ *
+ * When 'retry' is set to B_TRUE, the function will continually retry the
+ * eviction until *all* evictable buffers have been removed from the
+ * state. As a result, if concurrent insertions into the state are
+ * allowed (e.g. if the ARC isn't shutting down), this function might
+ * wind up in an infinite loop, continually trying to evict buffers.
+ */
+static uint64_t
+arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
+    boolean_t retry)
+{
+	uint64_t evicted = 0;
 
-	/*
-	 * We have just evicted some data into the ghost state, make
-	 * sure we also adjust the ghost state size if necessary.
-	 */
-	if (arc_no_grow &&
-	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
-		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
-		    arc_mru_ghost->arcs_size - arc_c;
+	while (refcount_count(&state->arcs_esize[type]) != 0) {
+		evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
 
-		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
-			int64_t todelete =
-			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
-			arc_evict_ghost(arc_mru_ghost, 0, todelete);
-		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
-			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
-			    arc_mru_ghost->arcs_size +
-			    arc_mfu_ghost->arcs_size - arc_c);
-			arc_evict_ghost(arc_mfu_ghost, 0, todelete);
-		}
+		if (!retry)
+			break;
 	}
-	if (stolen)
-		ARCSTAT_BUMP(arcstat_stolen);
 
-	return (stolen);
+	return (evicted);
 }
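
The difference between the two retry modes can be pictured with this
standalone sketch; evict_pass() and the unit counts are invented
stand-ins for arc_evict_state() and arcs_esize, not kernel interfaces.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static uint64_t evictable = 300;	/* stand-in for arcs_esize[type] */

/* stand-in for arc_evict_state(): best effort, at most 100 units/pass */
static uint64_t
evict_pass(void)
{
	uint64_t e = (evictable < 100) ? evictable : 100;
	evictable -= e;
	return (e);
}

int
main(void)
{
	bool retry = true;	/* B_FALSE would mean a single pass */
	uint64_t evicted = 0;

	while (evictable != 0) {
		evicted += evict_pass();
		if (!retry)
			break;
	}
	printf("evicted %llu units\n", (unsigned long long)evicted);
	return (0);
}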
 
 /*
- * Remove buffers from list until we've removed the specified number of
- * bytes.  Destroy the buffers that are removed.
+ * Evict the specified number of bytes from the state specified,
+ * restricting eviction to the spa and type given. This function
+ * prevents us from trying to evict more from a state's list than
+ * is "evictable", and to skip evicting altogether when passed a
+ * negative value for "bytes". In contrast, arc_evict_state() will
+ * evict everything it can, when passed a negative value for "bytes".
  */
-static void
-arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
+static uint64_t
+arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
+    arc_buf_contents_t type)
 {
-	arc_buf_hdr_t *ab, *ab_prev;
-	arc_buf_hdr_t marker = { 0 };
-	list_t *list, *list_start;
-	kmutex_t *hash_lock, *lock;
-	uint64_t bytes_deleted = 0;
-	uint64_t bufs_skipped = 0;
-	static int evict_offset;
-	int list_count, idx = evict_offset;
-	int offset, count = 0;
+	int64_t delta;
 
-	ASSERT(GHOST_STATE(state));
+	if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) {
+		delta = MIN(refcount_count(&state->arcs_esize[type]), bytes);
+		return (arc_evict_state(state, spa, delta, type));
+	}
 
+	return (0);
+}
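
A worked example of the clamping rule, with hypothetical sizes:

#include <stdio.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	int64_t evictable = 3 << 20;	/* 3 MB evictable in this state */
	int64_t bytes = 8 << 20;	/* the caller requests 8 MB */
	int64_t delta = 0;

	/* never request more than is evictable; skip when bytes <= 0 */
	if (bytes > 0 && evictable > 0)
		delta = MIN(evictable, bytes);

	printf("request clamped to %lld bytes\n", (long long)delta);
	return (0);
}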
+
+/*
+ * Evict metadata buffers from the cache, such that arc_meta_used is
+ * capped by the arc_meta_limit tunable.
+ */
+static uint64_t
+arc_adjust_meta(void)
+{
+	uint64_t total_evicted = 0;
+	int64_t target;
+
 	/*
-	 * data lists come after metadata lists
+	 * If we're over the meta limit, we want to evict enough
+	 * metadata to get back under the meta limit. We don't want to
+	 * evict so much that we drop the MRU below arc_p, though. If
+	 * we're over the meta limit more than we're over arc_p, we
+	 * evict some from the MRU here, and some from the MFU below.
 	 */
-	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
-	list_count = ARC_BUFC_NUMDATALISTS;
-	offset = ARC_BUFC_NUMMETADATALISTS;
+	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+	    (int64_t)(refcount_count(&arc_anon->arcs_size) +
+	    refcount_count(&arc_mru->arcs_size) - arc_p));
 
-evict_start:
-	list = &list_start[idx];
-	lock = ARCS_LOCK(state, idx + offset);
+	total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
 
-	mutex_enter(lock);
-	for (ab = list_tail(list); ab; ab = ab_prev) {
-		ab_prev = list_prev(list, ab);
-		if (spa && ab->b_spa != spa)
-			continue;
+	/*
+	 * Similar to the above, we want to evict enough bytes to get us
+	 * below the meta limit, but not so much as to drop us below the
+	 * space allotted to the MFU (which is defined as arc_c - arc_p).
+	 */
+	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+	    (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
 
-		/* ignore markers */
-		if (ab->b_spa == 0)
-			continue;
+	total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
 
-		hash_lock = HDR_LOCK(ab);
-		/* caller may be trying to modify this buffer, skip it */
-		if (MUTEX_HELD(hash_lock))
-			continue;
-		if (mutex_tryenter(hash_lock)) {
-			ASSERT(!HDR_IO_IN_PROGRESS(ab));
-			ASSERT(ab->b_buf == NULL);
-			ARCSTAT_BUMP(arcstat_deleted);
-			bytes_deleted += ab->b_size;
+	return (total_evicted);
+}
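
With hypothetical numbers, the MRU-side target works out as follows;
the smaller of the two excesses wins, and the MFU pass picks up the
remainder.

#include <stdio.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	int64_t meta_used = 600, meta_limit = 500;	/* 100 over limit */
	int64_t anon_plus_mru = 1040, arc_p = 1000;	/* 40 over arc_p */

	/* take from the MRU only what it is over arc_p by */
	int64_t target = MIN(meta_used - meta_limit, anon_plus_mru - arc_p);

	printf("MRU metadata target: %lld\n", (long long)target);  /* 40 */
	return (0);
}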
 
-			if (ab->b_l2hdr != NULL) {
-				/*
-				 * This buffer is cached on the 2nd Level ARC;
-				 * don't destroy the header.
-				 */
-				arc_change_state(arc_l2c_only, ab, hash_lock);
-				mutex_exit(hash_lock);
-			} else {
-				arc_change_state(arc_anon, ab, hash_lock);
-				mutex_exit(hash_lock);
-				arc_hdr_destroy(ab);
-			}
+/*
+ * Return the type of the oldest buffer in the given arc state
+ *
+ * This function will select a random sublist of type ARC_BUFC_DATA and
+ * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
+ * is compared, and the type which contains the "older" buffer will be
+ * returned.
+ */
+static arc_buf_contents_t
+arc_adjust_type(arc_state_t *state)
+{
+	multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
+	multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
+	int data_idx = multilist_get_random_index(data_ml);
+	int meta_idx = multilist_get_random_index(meta_ml);
+	multilist_sublist_t *data_mls;
+	multilist_sublist_t *meta_mls;
+	arc_buf_contents_t type;
+	arc_buf_hdr_t *data_hdr;
+	arc_buf_hdr_t *meta_hdr;
 
-			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
-			if (bytes >= 0 && bytes_deleted >= bytes)
-				break;
-		} else if (bytes < 0) {
-			/*
-			 * Insert a list marker and then wait for the
-			 * hash lock to become available. Once its
-			 * available, restart from where we left off.
-			 */
-			list_insert_after(list, ab, &marker);
-			mutex_exit(lock);
-			mutex_enter(hash_lock);
-			mutex_exit(hash_lock);
-			mutex_enter(lock);
-			ab_prev = list_prev(list, &marker);
-			list_remove(list, &marker);
-		} else
-			bufs_skipped += 1;
-	}
-	mutex_exit(lock);
-	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
-	count++;
+	/*
+	 * We keep the sublist lock until we're finished, to prevent
+	 * the headers from being destroyed via arc_evict_state().
+	 */
+	data_mls = multilist_sublist_lock(data_ml, data_idx);
+	meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
 
-	if (count < list_count)
-		goto evict_start;
+	/*
+	 * These two loops are to ensure we skip any markers that
+	 * might be at the tail of the lists due to arc_evict_state().
+	 */
 
-	evict_offset = idx;
-	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
-	    (bytes < 0 || bytes_deleted < bytes)) {
-		list_start = &state->arcs_lists[0];
-		list_count = ARC_BUFC_NUMMETADATALISTS;
-		offset = count = 0;
-		goto evict_start;
+	for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
+	    data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
+		if (data_hdr->b_spa != 0)
+			break;
 	}
 
-	if (bufs_skipped) {
-		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
-		ASSERT(bytes >= 0);
+	for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
+	    meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
+		if (meta_hdr->b_spa != 0)
+			break;
 	}
 
-	if (bytes_deleted < bytes)
-		dprintf("only deleted %lld bytes from %p",
-		    (longlong_t)bytes_deleted, state);
+	if (data_hdr == NULL && meta_hdr == NULL) {
+		type = ARC_BUFC_DATA;
+	} else if (data_hdr == NULL) {
+		ASSERT3P(meta_hdr, !=, NULL);
+		type = ARC_BUFC_METADATA;
+	} else if (meta_hdr == NULL) {
+		ASSERT3P(data_hdr, !=, NULL);
+		type = ARC_BUFC_DATA;
+	} else {
+		ASSERT3P(data_hdr, !=, NULL);
+		ASSERT3P(meta_hdr, !=, NULL);
+
+		/* The headers can't be on the sublist without an L1 header */
+		ASSERT(HDR_HAS_L1HDR(data_hdr));
+		ASSERT(HDR_HAS_L1HDR(meta_hdr));
+
+		if (data_hdr->b_l1hdr.b_arc_access <
+		    meta_hdr->b_l1hdr.b_arc_access) {
+			type = ARC_BUFC_DATA;
+		} else {
+			type = ARC_BUFC_METADATA;
+		}
+	}
+
+	multilist_sublist_unlock(meta_mls);
+	multilist_sublist_unlock(data_mls);
+
+	return (type);
 }
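
A minimal sketch of the comparison at the heart of this function; the
access times are invented stand-ins for b_l1hdr.b_arc_access.

#include <stdio.h>

int
main(void)
{
	long data_access = 100;	/* oldest data buffer's last access, ticks */
	long meta_access = 250;	/* oldest metadata buffer's last access */

	/* the smaller timestamp is the older buffer; evict its type */
	const char *victim = (data_access < meta_access) ?
	    "ARC_BUFC_DATA" : "ARC_BUFC_METADATA";

	printf("evict from %s\n", victim);
	return (0);
}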
 
-static void
+/*
+ * Evict buffers from the cache, such that arc_size is capped by arc_c.
+ */
+static uint64_t
 arc_adjust(void)
 {
-	int64_t adjustment, delta;
+	uint64_t total_evicted = 0;
+	uint64_t bytes;
+	int64_t target;
 
 	/*
+	 * If we're over arc_meta_limit, we want to correct that before
+	 * potentially evicting data buffers below.
+	 */
+	total_evicted += arc_adjust_meta();
+
+	/*
 	 * Adjust MRU size
+	 *
+	 * If we're over the target cache size, we want to evict enough
+	 * from the list to get back to our target size. We don't want
+	 * to evict too much from the MRU, such that it drops below
+	 * arc_p. So, if we're over our target cache size more than
+	 * the MRU is over arc_p, we'll evict enough to get back to
+	 * arc_p here, and then evict more from the MFU below.
 	 */
+	target = MIN((int64_t)(arc_size - arc_c),
+	    (int64_t)(refcount_count(&arc_anon->arcs_size) +
+	    refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
 
-	adjustment = MIN((int64_t)(arc_size - arc_c),
-	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
-	    arc_p));
+	/*
+	 * If we're below arc_meta_min, always prefer to evict data.
+	 * Otherwise, try to satisfy the requested number of bytes to
+	 * evict from the type which contains older buffers; in an
+	 * evict from the type which contains older buffers, in an
+	 * type. If we cannot satisfy the number of bytes from this
+	 * type, spill over into the next type.
+	 */
+	if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
+	    arc_meta_used > arc_meta_min) {
+		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+		total_evicted += bytes;
 
-	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
-		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
-		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
-		adjustment -= delta;
-	}
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * metadata, we try to get the rest from data.
+		 */
+		target -= bytes;
 
-	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
-		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
-		(void) arc_evict(arc_mru, 0, delta, FALSE,
-		    ARC_BUFC_METADATA);
+		total_evicted +=
+		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+	} else {
+		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+		total_evicted += bytes;
+
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * data, we try to get the rest from metadata.
+		 */
+		target -= bytes;
+
+		total_evicted +=
+		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
 	}
 
 	/*
 	 * Adjust MFU size
+	 *
+	 * Now that we've tried to evict enough from the MRU to get its
+	 * size back to arc_p, if we're still above the target cache
+	 * size, we evict the rest from the MFU.
 	 */
+	target = arc_size - arc_c;
 
-	adjustment = arc_size - arc_c;
+	if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
+	    arc_meta_used > arc_meta_min) {
+		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+		total_evicted += bytes;
 
-	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
-		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
-		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
-		adjustment -= delta;
-	}
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * metadata, we try to get the rest from data.
+		 */
+		target -= bytes;
 
-	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
-		int64_t delta = MIN(adjustment,
-		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
-		(void) arc_evict(arc_mfu, 0, delta, FALSE,
-		    ARC_BUFC_METADATA);
+		total_evicted +=
+		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+	} else {
+		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+		total_evicted += bytes;
+
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * data, we try to get the rest from metadata.
+		 */
+		target -= bytes;
+
+		total_evicted +=
+		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
 	}
 
 	/*
 	 * Adjust ghost lists
+	 *
+	 * In addition to the above, the ARC also defines target values
+	 * for the ghost lists. The sum of the mru list and mru ghost
+	 * list should never exceed the target size of the cache, and
+	 * the sum of the mru list, mfu list, mru ghost list, and mfu
+	 * ghost list should never exceed twice the target size of the
+	 * cache. The following logic enforces these limits on the ghost
+	 * caches, and evicts from them as needed.
 	 */
+	target = refcount_count(&arc_mru->arcs_size) +
+	    refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
 
-	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
+	bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
+	total_evicted += bytes;
 
-	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
-		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
-		arc_evict_ghost(arc_mru_ghost, 0, delta);
-	}
+	target -= bytes;
 
-	adjustment =
-	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
+	total_evicted +=
+	    arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
 
-	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
-		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
-		arc_evict_ghost(arc_mfu_ghost, 0, delta);
-	}
-}
-
-static void
-arc_do_user_evicts(void)
-{
-	static arc_buf_t *tmp_arc_eviction_list;
-
 	/*
-	 * Move list over to avoid LOR
+	 * We assume the sum of the mru list and mfu list is less than
+	 * or equal to arc_c (we enforced this above), which means we
+	 * can use the simpler of the two equations below:
+	 *
+	 *	mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
+	 *		    mru ghost + mfu ghost <= arc_c
 	 */
-restart:
-	mutex_enter(&arc_eviction_mtx);
-	tmp_arc_eviction_list = arc_eviction_list;
-	arc_eviction_list = NULL;
-	mutex_exit(&arc_eviction_mtx);
+	target = refcount_count(&arc_mru_ghost->arcs_size) +
+	    refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
 
-	while (tmp_arc_eviction_list != NULL) {
-		arc_buf_t *buf = tmp_arc_eviction_list;
-		tmp_arc_eviction_list = buf->b_next;
-		mutex_enter(&buf->b_evict_lock);
-		buf->b_hdr = NULL;
-		mutex_exit(&buf->b_evict_lock);
+	bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
+	total_evicted += bytes;
 
-		if (buf->b_efunc != NULL)
-			VERIFY(buf->b_efunc(buf) == 0);
+	target -= bytes;
 
-		buf->b_efunc = NULL;
-		buf->b_private = NULL;
-		kmem_cache_free(buf_cache, buf);
-	}
+	total_evicted +=
+	    arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
 
-	if (arc_eviction_list != NULL)
-		goto restart;
+	return (total_evicted);
 }
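
Plugging hypothetical sizes into the two invariants gives the
ghost-list targets directly:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int64_t arc_c = 1000;
	int64_t mru = 600, mru_ghost = 500, mfu_ghost = 300;

	/* invariant 1: mru + mru ghost <= arc_c */
	printf("mru ghost target: %lld\n",
	    (long long)(mru + mru_ghost - arc_c));		/* 100 */

	/* invariant 2, simplified: mru ghost + mfu ghost <= arc_c */
	printf("mfu ghost target: %lld\n",
	    (long long)(mru_ghost + mfu_ghost - arc_c));	/* -200 */
	return (0);
}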
 
-/*
- * Flush all *evictable* data from the cache for the given spa.
- * NOTE: this will not touch "active" (i.e. referenced) data.
- */
 void
-arc_flush(spa_t *spa)
+arc_flush(spa_t *spa, boolean_t retry)
 {
 	uint64_t guid = 0;
 
-	if (spa)
+	/*
+	 * If retry is B_TRUE, a spa must not be specified since we have
+	 * no good way to determine if all of a spa's buffers have been
+	 * evicted from an arc state.
+	 */
+	ASSERT(!retry || spa == 0);
+
+	if (spa != NULL)
 		guid = spa_load_guid(spa);
 
-	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
-		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
-		if (spa)
-			break;
-	}
-	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
-		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
-		if (spa)
-			break;
-	}
-	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
-		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
-		if (spa)
-			break;
-	}
-	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
-		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
-		if (spa)
-			break;
-	}
+	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
+	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
 
-	arc_evict_ghost(arc_mru_ghost, guid, -1);
-	arc_evict_ghost(arc_mfu_ghost, guid, -1);
+	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
+	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
 
-	mutex_enter(&arc_reclaim_thr_lock);
-	arc_do_user_evicts();
-	mutex_exit(&arc_reclaim_thr_lock);
-	ASSERT(spa || arc_eviction_list == NULL);
+	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
+	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
+
+	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
+	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
 }
 
 void
-arc_shrink(void)
+arc_shrink(int64_t to_free)
 {
 	if (arc_c > arc_c_min) {
-		uint64_t to_free;
-
-#ifdef _KERNEL
-		to_free = arc_c >> arc_shrink_shift;
-#else
-		to_free = arc_c >> arc_shrink_shift;
-#endif
+		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
+			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
 		if (arc_c > arc_c_min + to_free)
 			atomic_add_64(&arc_c, -to_free);
 		else
@@ -2378,39 +3815,80 @@
 			arc_c = MAX(arc_size, arc_c_min);
 		if (arc_p > arc_c)
 			arc_p = (arc_c >> 1);
+
+		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
+			arc_p);
+
 		ASSERT(arc_c >= arc_c_min);
 		ASSERT((int64_t)arc_p >= 0);
 	}
 
-	if (arc_size > arc_c)
-		arc_adjust();
+	if (arc_size > arc_c) {
+		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
+			uint64_t, arc_c);
+		(void) arc_adjust();
+	}
 }
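
A worked example of the clamping above; all values, including the
8 MiB request, are hypothetical.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t arc_c = 1024ULL << 20;		/* 1 GiB target size */
	uint64_t arc_c_min = 256ULL << 20;	/* 256 MiB floor */
	uint64_t to_free = 8ULL << 20;		/* caller wants 8 MiB back */

	/* shrink arc_c by to_free, but never below arc_c_min */
	if (arc_c > arc_c_min + to_free)
		arc_c -= to_free;
	else
		arc_c = arc_c_min;

	printf("new arc_c: %llu MiB\n", (unsigned long long)(arc_c >> 20));
	return (0);
}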
 
-static int needfree = 0;
+static long needfree = 0;
 
-static int
-arc_reclaim_needed(void)
+typedef enum free_memory_reason_t {
+	FMR_UNKNOWN,
+	FMR_NEEDFREE,
+	FMR_LOTSFREE,
+	FMR_SWAPFS_MINFREE,
+	FMR_PAGES_PP_MAXIMUM,
+	FMR_HEAP_ARENA,
+	FMR_ZIO_ARENA,
+	FMR_ZIO_FRAG,
+} free_memory_reason_t;
+
+int64_t last_free_memory;
+free_memory_reason_t last_free_reason;
+
+/*
+ * Additional reserve of pages for pp_reserve.
+ */
+int64_t arc_pages_pp_reserve = 64;
+
+/*
+ * Additional reserve of pages for swapfs.
+ */
+int64_t arc_swapfs_reserve = 64;
+
+/*
+ * Return the amount of memory that can be consumed before reclaim will be
+ * needed.  A positive value means there is sufficient free memory; a
+ * negative value indicates the amount of memory that needs to be freed up.
+ */
+static int64_t
+arc_available_memory(void)
 {
+	int64_t lowest = INT64_MAX;
+	int64_t n;
+	free_memory_reason_t r = FMR_UNKNOWN;
 
 #ifdef _KERNEL
+	if (needfree > 0) {
+		n = PAGESIZE * (-needfree);
+		if (n < lowest) {
+			lowest = n;
+			r = FMR_NEEDFREE;
+		}
+	}
 
-	if (needfree)
-		return (1);
-
 	/*
 	 * Cooperate with pagedaemon when it's time for it to scan
 	 * and reclaim some pages.
 	 */
-	if (vm_paging_needed())
-		return (1);
+	n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_LOTSFREE;
+	}
 
-#ifdef sun
+#ifdef illumos
 	/*
-	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
-	 */
-	extra = desfree;
-
-	/*
 	 * check that we're out of range of the pageout scanner.  It starts to
 	 * schedule paging if freemem is less than lotsfree and needfree.
 	 * lotsfree is the high-water mark for pageout, and needfree is the
@@ -2417,8 +3895,11 @@
 	 * number of needed free pages.  We add extra pages here to make sure
 	 * the scanner doesn't start up while we're freeing memory.
 	 */
-	if (freemem < lotsfree + needfree + extra)
-		return (1);
+	n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_LOTSFREE;
+	}
 
 	/*
 	 * check to make sure that swapfs has enough space so that anon
@@ -2427,11 +3908,31 @@
 	 * swap pages.  We also add a bit of extra here just to prevent
 	 * circumstances from getting really dire.
 	 */
-	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
-		return (1);
+	n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
+	    desfree - arc_swapfs_reserve);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_SWAPFS_MINFREE;
+	}
 
-#if defined(__i386)
+
 	/*
+	 * Check that we have enough availrmem that memory locking (e.g., via
+	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
+	 * stores the number of pages that cannot be locked; when availrmem
+	 * drops below pages_pp_maximum, page locking mechanisms such as
+	 * page_pp_lock() will fail.)
+	 */
+	n = PAGESIZE * (availrmem - pages_pp_maximum -
+	    arc_pages_pp_reserve);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_PAGES_PP_MAXIMUM;
+	}
+
+#endif	/* illumos */
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
+	/*
 	 * If we're on an i386 platform, it's possible that we'll exhaust the
 	 * kernel heap space before we ever run out of available physical
 	 * memory.  Most checks of the size of the heap_area compare against
@@ -2442,32 +3943,85 @@
 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
 	 * free)
 	 */
-	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
-	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
-		return (1);
+	n = (int64_t)vmem_size(heap_arena, VMEM_FREE) -
+	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_HEAP_ARENA;
+	}
+#define	zio_arena	NULL
+#else
+#define	zio_arena	heap_arena
 #endif
-#else	/* !sun */
-	if (kmem_used() > (kmem_size() * 3) / 4)
-		return (1);
-#endif	/* sun */
 
-#else
+	/*
+	 * If zio data pages are being allocated out of a separate heap segment,
+	 * then enforce that the amount of free vmem in this arena remains
+	 * above roughly 1/16th of the amount allocated.
+	 *
+	 * Note: The 1/16th arena free requirement was put in place
+	 * to aggressively evict memory from the arc in order to avoid
+	 * memory fragmentation issues.
+	 */
+	if (zio_arena != NULL) {
+		n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
+		    (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
+		if (n < lowest) {
+			lowest = n;
+			r = FMR_ZIO_ARENA;
+		}
+	}
+
+	/*
+	 * The limits above know nothing about the real level of KVA
+	 * fragmentation. Start aggressive reclamation if too little
+	 * contiguous KVA is left.
+	 */
+	if (lowest > 0) {
+		n = (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) ?
+		    -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) :
+		    INT64_MAX;
+		if (n < lowest) {
+			lowest = n;
+			r = FMR_ZIO_FRAG;
+		}
+	}
+
+#else	/* _KERNEL */
+	/* Every 100 calls, free a small amount */
 	if (spa_get_random(100) == 0)
-		return (1);
-#endif
-	return (0);
+		lowest = -1024;
+#endif	/* _KERNEL */
+
+	last_free_memory = lowest;
+	last_free_reason = r;
+	DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
+	return (lowest);
 }
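
Structurally, the function above runs every check and remembers the
worst margin along with its reason; a standalone sketch with made-up
margins:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int64_t lowest = INT64_MAX;
	const char *reason = "FMR_UNKNOWN";

	/* each entry is one check's free-memory margin, in bytes */
	int64_t margins[] = { 4096, -8192, 1024 };
	const char *names[] =
	    { "FMR_LOTSFREE", "FMR_SWAPFS_MINFREE", "FMR_HEAP_ARENA" };

	for (int i = 0; i < 3; i++) {
		if (margins[i] < lowest) {
			lowest = margins[i];
			reason = names[i];
		}
	}

	/* negative means reclaim is needed; reason names the limit hit */
	printf("lowest = %lld (%s)\n", (long long)lowest, reason);
	return (0);
}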
 
+
+/*
+ * Determine if the system is under memory pressure and is asking
+ * to reclaim memory. A return value of B_TRUE indicates that the system
+ * is under memory pressure and that the arc should adjust accordingly.
+ */
+static boolean_t
+arc_reclaim_needed(void)
+{
+	return (arc_available_memory() < 0);
+}
+
 extern kmem_cache_t	*zio_buf_cache[];
 extern kmem_cache_t	*zio_data_buf_cache[];
+extern kmem_cache_t	*range_seg_cache;
 
-static void
-arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+static __noinline void
+arc_kmem_reap_now(void)
 {
 	size_t			i;
 	kmem_cache_t		*prev_cache = NULL;
 	kmem_cache_t		*prev_data_cache = NULL;
 
+	DTRACE_PROBE(arc__kmem_reap_start);
 #ifdef _KERNEL
 	if (arc_meta_used >= arc_meta_limit) {
 		/*
@@ -2484,13 +4038,6 @@
 #endif
 #endif
 
-	/*
-	 * An aggressive reclamation will shrink the cache size as well as
-	 * reap free buffers from the arc kmem caches.
-	 */
-	if (strat == ARC_RECLAIM_AGGR)
-		arc_shrink();
-
 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
 		if (zio_buf_cache[i] != prev_cache) {
 			prev_cache = zio_buf_cache[i];
@@ -2502,75 +4049,144 @@
 		}
 	}
 	kmem_cache_reap_now(buf_cache);
-	kmem_cache_reap_now(hdr_cache);
+	kmem_cache_reap_now(hdr_full_cache);
+	kmem_cache_reap_now(hdr_l2only_cache);
+	kmem_cache_reap_now(range_seg_cache);
+
+#ifdef illumos
+	if (zio_arena != NULL) {
+		/*
+		 * Ask the vmem arena to reclaim unused memory from its
+		 * quantum caches.
+		 */
+		vmem_qcache_reap(zio_arena);
+	}
+#endif
+	DTRACE_PROBE(arc__kmem_reap_end);
 }
 
+/*
+ * Threads can block in arc_get_data_buf() waiting for this thread to evict
+ * enough data and signal them to proceed. When this happens, the threads in
+ * arc_get_data_buf() are sleeping while holding the hash lock for their
+ * particular arc header. Thus, we must be careful to never sleep on a
+ * hash lock in this thread. This is to prevent the following deadlock:
+ *
+ *  - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
+ *    waiting for the reclaim thread to signal it.
+ *
+ *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
+ *    fails, and goes to sleep forever.
+ *
+ * This possible deadlock is avoided by always acquiring a hash lock
+ * using mutex_tryenter() from arc_reclaim_thread().
+ */
 static void
 arc_reclaim_thread(void *dummy __unused)
 {
-	clock_t			growtime = 0;
-	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
+	hrtime_t		growtime = 0;
 	callb_cpr_t		cpr;
 
-	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
+	CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
 
-	mutex_enter(&arc_reclaim_thr_lock);
-	while (arc_thread_exit == 0) {
-		if (arc_reclaim_needed()) {
+	mutex_enter(&arc_reclaim_lock);
+	while (!arc_reclaim_thread_exit) {
+		uint64_t evicted = 0;
 
-			if (arc_no_grow) {
-				if (last_reclaim == ARC_RECLAIM_CONS) {
-					last_reclaim = ARC_RECLAIM_AGGR;
-				} else {
-					last_reclaim = ARC_RECLAIM_CONS;
-				}
-			} else {
-				arc_no_grow = TRUE;
-				last_reclaim = ARC_RECLAIM_AGGR;
-				membar_producer();
-			}
+		/*
+		 * This is necessary in order for the mdb ::arc dcmd to
+		 * show up-to-date information. Since the ::arc command
+		 * does not call the kstat's update function, without
+		 * this call, the command may show stale stats for the
+		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
+		 * with this change, the data might be up to 1 second
+		 * out of date; but that should suffice. The arc_state_t
+		 * structures can be queried directly if more accurate
+		 * information is needed.
+		 */
+		if (arc_ksp != NULL)
+			arc_ksp->ks_update(arc_ksp, KSTAT_READ);
 
-			/* reset the growth delay for every reclaim */
-			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
+		mutex_exit(&arc_reclaim_lock);
 
-			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
-				/*
-				 * If needfree is TRUE our vm_lowmem hook
-				 * was called and in that case we must free some
-				 * memory, so switch to aggressive mode.
-				 */
-				arc_no_grow = TRUE;
-				last_reclaim = ARC_RECLAIM_AGGR;
-			}
-			arc_kmem_reap_now(last_reclaim);
+		/*
+		 * We call arc_adjust() before (possibly) calling
+		 * arc_kmem_reap_now(), so that we can wake up
+		 * arc_get_data_buf() sooner.
+		 */
+		evicted = arc_adjust();
+
+		int64_t free_memory = arc_available_memory();
+		if (free_memory < 0) {
+
+			arc_no_grow = B_TRUE;
 			arc_warm = B_TRUE;
 
-		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
-			arc_no_grow = FALSE;
-		}
+			/*
+			 * Wait at least zfs_grow_retry (default 60) seconds
+			 * before considering growing.
+			 */
+			growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
 
-		arc_adjust();
+			arc_kmem_reap_now();
 
-		if (arc_eviction_list != NULL)
-			arc_do_user_evicts();
+			/*
+			 * If we are still low on memory, shrink the ARC
+			 * so that we have arc_shrink_min free space.
+			 */
+			free_memory = arc_available_memory();
 
+			int64_t to_free =
+			    (arc_c >> arc_shrink_shift) - free_memory;
+			if (to_free > 0) {
 #ifdef _KERNEL
-		if (needfree) {
+				to_free = MAX(to_free, ptob(needfree));
+#endif
+				arc_shrink(to_free);
+			}
+		} else if (free_memory < arc_c >> arc_no_grow_shift) {
+			arc_no_grow = B_TRUE;
+		} else if (gethrtime() >= growtime) {
+			arc_no_grow = B_FALSE;
+		}
+
+		mutex_enter(&arc_reclaim_lock);
+
+		/*
+		 * If evicted is zero, we couldn't evict anything via
+		 * arc_adjust(). This could be due to hash lock
+		 * collisions, but more likely due to the majority of
+		 * arc buffers being unevictable. Therefore, even if
+		 * arc_size is above arc_c, another pass is unlikely to
+		 * be helpful and could potentially cause us to enter an
+		 * infinite loop.
+		 */
+		if (arc_size <= arc_c || evicted == 0) {
+#ifdef _KERNEL
 			needfree = 0;
-			wakeup(&needfree);
-		}
 #endif
+			/*
+			 * We're either no longer overflowing, or we
+			 * can't evict anything more, so we should wake
+			 * up any threads before we go to sleep.
+			 */
+			cv_broadcast(&arc_reclaim_waiters_cv);
 
-		/* block until needed, or one second, whichever is shorter */
-		CALLB_CPR_SAFE_BEGIN(&cpr);
-		(void) cv_timedwait(&arc_reclaim_thr_cv,
-		    &arc_reclaim_thr_lock, hz);
-		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+			/*
+			 * Block until signaled, or after one second (we
+			 * might need to perform arc_kmem_reap_now()
+			 * even if we aren't being signaled)
+			 */
+			CALLB_CPR_SAFE_BEGIN(&cpr);
+			(void) cv_timedwait_hires(&arc_reclaim_thread_cv,
+			    &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
+			CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
+		}
 	}
 
-	arc_thread_exit = 0;
-	cv_broadcast(&arc_reclaim_thr_cv);
-	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
+	arc_reclaim_thread_exit = B_FALSE;
+	cv_broadcast(&arc_reclaim_thread_cv);
+	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_lock */
 	thread_exit();
 }
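
The rule in the comment block above reduces to: only ever trylock a
hash lock from the reclaim thread. A pthread-based sketch of that
discipline (an assumption; the kernel's mutex primitives differ):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;

static void
reclaim_scan(void)
{
	/*
	 * trylock only: the holder may be asleep in arc_get_data_buf()
	 * waiting on *this* thread, so blocking here could deadlock.
	 */
	if (pthread_mutex_trylock(&hash_lock) == 0) {
		printf("evicting header\n");
		pthread_mutex_unlock(&hash_lock);
	} else {
		printf("skipping header; it stays for the next pass\n");
	}
}

int
main(void)
{
	reclaim_scan();
	return (0);
}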
 
@@ -2584,6 +4200,8 @@
 {
 	int mult;
 	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+	int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
+	int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
 
 	if (state == arc_l2c_only)
 		return;
@@ -2598,8 +4216,7 @@
 	 *	  target size of the MRU list.
 	 */
 	if (state == arc_mru_ghost) {
-		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
-		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
+		mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
 		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
 
 		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
@@ -2606,8 +4223,7 @@
 	} else if (state == arc_mfu_ghost) {
 		uint64_t delta;
 
-		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
-		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
+		mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
 		mult = MIN(mult, 10);
 
 		delta = MIN(bytes * mult, arc_p);
@@ -2616,7 +4232,7 @@
 	ASSERT((int64_t)arc_p >= 0);
 
 	if (arc_reclaim_needed()) {
-		cv_signal(&arc_reclaim_thr_cv);
+		cv_signal(&arc_reclaim_thread_cv);
 		return;
 	}
 
@@ -2631,6 +4247,7 @@
 	 * cache size, increment the target cache size
 	 */
 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
 		atomic_add_64(&arc_c, (int64_t)bytes);
 		if (arc_c > arc_c_max)
 			arc_c = arc_c_max;
@@ -2643,152 +4260,159 @@
 }
 
 /*
- * Check if the cache has reached its limits and eviction is required
- * prior to insert.
+ * Check if arc_size has grown past our upper threshold, determined by
+ * zfs_arc_overflow_shift.
  */
-static int
-arc_evict_needed(arc_buf_contents_t type)
+static boolean_t
+arc_is_overflowing(void)
 {
-	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
-		return (1);
+	/* Always allow at least one block of overflow */
+	uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
+	    arc_c >> zfs_arc_overflow_shift);
 
-#ifdef sun
-#ifdef _KERNEL
-	/*
-	 * If zio data pages are being allocated out of a separate heap segment,
-	 * then enforce that the size of available vmem for this area remains
-	 * above about 1/32nd free.
-	 */
-	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
-	    vmem_size(zio_arena, VMEM_FREE) <
-	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
-		return (1);
-#endif
-#endif	/* sun */
-
-	if (arc_reclaim_needed())
-		return (1);
-
-	return (arc_size > arc_c);
+	return (arc_size >= arc_c + overflow);
 }
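
A worked example of the threshold; the shift's default of 8 and the
128 KiB maximum block size are assumptions here.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	uint64_t spa_maxblocksize = 128ULL << 10;	/* 128 KiB */
	uint64_t arc_c = 1024ULL << 20;			/* 1 GiB target */
	int overflow_shift = 8;				/* assumed default */

	/* always allow at least one max-sized block of overflow */
	uint64_t overflow = MAX(spa_maxblocksize, arc_c >> overflow_shift);
	uint64_t arc_size = arc_c + (8ULL << 20);	/* 8 MiB over */

	bool overflowing = (arc_size >= arc_c + overflow);
	printf("window %llu KiB, overflowing: %s\n",
	    (unsigned long long)(overflow >> 10), overflowing ? "yes" : "no");
	return (0);
}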
 
 /*
- * The buffer, supplied as the first argument, needs a data block.
- * So, if we are at cache max, determine which cache should be victimized.
- * We have the following cases:
- *
- * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
- * In this situation if we're out of space, but the resident size of the MFU is
- * under the limit, victimize the MFU cache to satisfy this insertion request.
- *
- * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
- * Here, we've used up all of the available space for the MRU, so we need to
- * evict from our own cache instead.  Evict from the set of resident MRU
- * entries.
- *
- * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
- * c minus p represents the MFU space in the cache, since p is the size of the
- * cache that is dedicated to the MRU.  In this situation there's still space on
- * the MFU side, so the MRU side needs to be victimized.
- *
- * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
- * MFU's resident set is consuming more space than it has been allotted.  In
- * this situation, we must victimize our own cache, the MFU, for this insertion.
+ * Allocate a block and return it to the caller. If we are hitting the
+ * hard limit for the cache size, we must sleep, waiting for the eviction
+ * thread to catch up. If we're past the target size but below the hard
+ * limit, we'll only signal the reclaim thread and continue on.
  */
-static void
-arc_get_data_buf(arc_buf_t *buf)
+static void *
+arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 {
-	arc_state_t		*state = buf->b_hdr->b_state;
-	uint64_t		size = buf->b_hdr->b_size;
-	arc_buf_contents_t	type = buf->b_hdr->b_type;
+	void *datap = NULL;
+	arc_state_t		*state = hdr->b_l1hdr.b_state;
+	arc_buf_contents_t	type = arc_buf_type(hdr);
 
 	arc_adapt(size, state);
 
 	/*
-	 * We have not yet reached cache maximum size,
-	 * just allocate a new buffer.
+	 * If arc_size is currently overflowing, and has grown past our
+	 * upper limit, we must be adding data faster than the evict
+	 * thread can evict. Thus, to ensure we don't compound the
+	 * problem by adding more data and forcing arc_size to grow even
+	 * further past its target size, we halt and wait for the
+	 * eviction thread to catch up.
+	 *
+	 * It's also possible that the reclaim thread is unable to evict
+	 * enough buffers to get arc_size below the overflow limit (e.g.
+	 * due to buffers being un-evictable, or hash lock collisions).
+	 * In this case, we want to proceed regardless of whether we're
+	 * overflowing; thus we don't use a while loop here.
 	 */
-	if (!arc_evict_needed(type)) {
-		if (type == ARC_BUFC_METADATA) {
-			buf->b_data = zio_buf_alloc(size);
-			arc_space_consume(size, ARC_SPACE_DATA);
-		} else {
-			ASSERT(type == ARC_BUFC_DATA);
-			buf->b_data = zio_data_buf_alloc(size);
-			ARCSTAT_INCR(arcstat_data_size, size);
-			atomic_add_64(&arc_size, size);
+	if (arc_is_overflowing()) {
+		mutex_enter(&arc_reclaim_lock);
+
+		/*
+		 * Now that we've acquired the lock, we may no longer be
+		 * over the overflow limit; let's check.
+		 *
+		 * We're ignoring the case of spurious wake ups. If that
+		 * were to happen, it'd let this thread consume an ARC
+		 * buffer before it should have (i.e. before we're under
+		 * the overflow limit and were signalled by the reclaim
+		 * the overflow limit and were signaled by the reclaim
+		 * shouldn't cause any harm.
+		 */
+		if (arc_is_overflowing()) {
+			cv_signal(&arc_reclaim_thread_cv);
+			cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
 		}
-		goto out;
+
+		mutex_exit(&arc_reclaim_lock);
 	}
 
-	/*
-	 * If we are prefetching from the mfu ghost list, this buffer
-	 * will end up on the mru list; so steal space from there.
-	 */
-	if (state == arc_mfu_ghost)
-		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
-	else if (state == arc_mru_ghost)
-		state = arc_mru;
-
-	if (state == arc_mru || state == arc_anon) {
-		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
-		state = (arc_mfu->arcs_lsize[type] >= size &&
-		    arc_p > mru_used) ? arc_mfu : arc_mru;
+	VERIFY3U(hdr->b_type, ==, type);
+	if (type == ARC_BUFC_METADATA) {
+		datap = zio_buf_alloc(size);
+		arc_space_consume(size, ARC_SPACE_META);
 	} else {
-		/* MFU cases */
-		uint64_t mfu_space = arc_c - arc_p;
-		state =  (arc_mru->arcs_lsize[type] >= size &&
-		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
+		ASSERT(type == ARC_BUFC_DATA);
+		datap = zio_data_buf_alloc(size);
+		arc_space_consume(size, ARC_SPACE_DATA);
 	}
-	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
-		if (type == ARC_BUFC_METADATA) {
-			buf->b_data = zio_buf_alloc(size);
-			arc_space_consume(size, ARC_SPACE_DATA);
-		} else {
-			ASSERT(type == ARC_BUFC_DATA);
-			buf->b_data = zio_data_buf_alloc(size);
-			ARCSTAT_INCR(arcstat_data_size, size);
-			atomic_add_64(&arc_size, size);
-		}
-		ARCSTAT_BUMP(arcstat_recycle_miss);
-	}
-	ASSERT(buf->b_data != NULL);
-out:
+
 	/*
 	 * Update the state size.  Note that ghost states have a
 	 * "ghost size" and so don't need to be updated.
 	 */
-	if (!GHOST_STATE(buf->b_hdr->b_state)) {
-		arc_buf_hdr_t *hdr = buf->b_hdr;
+	if (!GHOST_STATE(state)) {
 
-		atomic_add_64(&hdr->b_state->arcs_size, size);
-		if (list_link_active(&hdr->b_arc_node)) {
-			ASSERT(refcount_is_zero(&hdr->b_refcnt));
-			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
+		(void) refcount_add_many(&state->arcs_size, size, tag);
+
+		/*
+		 * If this is reached via arc_read, the link is
+		 * protected by the hash lock. If reached via
+		 * arc_buf_alloc, the header should not be accessed by
+		 * any other thread. And, if reached via arc_read_done,
+		 * the hash lock will protect it if it's found in the
+		 * hash table; otherwise no other thread should be
+		 * trying to [add|remove]_reference it.
+		 */
+		if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+			(void) refcount_add_many(&state->arcs_esize[type],
+			    size, tag);
 		}
+
 		/*
 		 * If we are growing the cache, and we are adding anonymous
 		 * data, and we have outgrown arc_p, update arc_p
 		 */
-		if (arc_size < arc_c && hdr->b_state == arc_anon &&
-		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
+		if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
+		    (refcount_count(&arc_anon->arcs_size) +
+		    refcount_count(&arc_mru->arcs_size) > arc_p))
 			arc_p = MIN(arc_c, arc_p + size);
 	}
 	ARCSTAT_BUMP(arcstat_allocated);
+	return (datap);
 }
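
The throttle at the top of this function (signal the evictor, wait
once, then proceed) can be sketched with pthreads. This is an
assumption: the kernel's cv primitives differ, and the real reclaim
thread is what broadcasts the waiters' CV.

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reclaim_thread_cv = PTHREAD_COND_INITIALIZER;
static pthread_cond_t waiters_cv = PTHREAD_COND_INITIALIZER;
static bool overflowing = false;	/* false so this sketch returns */

static void
throttle_allocation(void)
{
	pthread_mutex_lock(&reclaim_lock);
	if (overflowing) {	/* re-check now that we hold the lock */
		pthread_cond_signal(&reclaim_thread_cv);  /* kick evictor */
		/* deliberately "if", not "while": proceed after one wakeup */
		pthread_cond_wait(&waiters_cv, &reclaim_lock);
	}
	pthread_mutex_unlock(&reclaim_lock);
	/* allocate even if still overflowing, to avoid livelock */
}

int
main(void)
{
	throttle_allocation();
	return (0);
}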
 
 /*
+ * Free the arc data buffer.
+ */
+static void
+arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag)
+{
+	arc_state_t *state = hdr->b_l1hdr.b_state;
+	arc_buf_contents_t type = arc_buf_type(hdr);
+
+	/* protected by hash lock, if in the hash table */
+	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+		ASSERT(state != arc_anon && state != arc_l2c_only);
+
+		(void) refcount_remove_many(&state->arcs_esize[type],
+		    size, tag);
+	}
+	(void) refcount_remove_many(&state->arcs_size, size, tag);
+
+	VERIFY3U(hdr->b_type, ==, type);
+	if (type == ARC_BUFC_METADATA) {
+		zio_buf_free(data, size);
+		arc_space_return(size, ARC_SPACE_META);
+	} else {
+		ASSERT(type == ARC_BUFC_DATA);
+		zio_data_buf_free(data, size);
+		arc_space_return(size, ARC_SPACE_DATA);
+	}
+}
+
+/*
  * This routine is called whenever a buffer is accessed.
  * NOTE: the hash lock is dropped in this function.
  */
 static void
-arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
+arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
 	clock_t now;
 
 	ASSERT(MUTEX_HELD(hash_lock));
+	ASSERT(HDR_HAS_L1HDR(hdr));
 
-	if (buf->b_state == arc_anon) {
+	if (hdr->b_l1hdr.b_state == arc_anon) {
 		/*
 		 * This buffer is not in the cache, and does not
 		 * appear in our "ghost" list.  Add the new buffer
@@ -2795,12 +4419,12 @@
 		 * to the MRU state.
 		 */
 
-		ASSERT(buf->b_arc_access == 0);
-		buf->b_arc_access = ddi_get_lbolt();
-		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
-		arc_change_state(arc_mru, buf, hash_lock);
+		ASSERT0(hdr->b_l1hdr.b_arc_access);
+		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
+		arc_change_state(arc_mru, hdr, hash_lock);
 
-	} else if (buf->b_state == arc_mru) {
+	} else if (hdr->b_l1hdr.b_state == arc_mru) {
 		now = ddi_get_lbolt();
 
 		/*
@@ -2811,14 +4435,16 @@
 		 * - move the buffer to the head of the list if this is
 		 *   another prefetch (to make it less likely to be evicted).
 		 */
-		if ((buf->b_flags & ARC_PREFETCH) != 0) {
-			if (refcount_count(&buf->b_refcnt) == 0) {
-				ASSERT(list_link_active(&buf->b_arc_node));
+		if (HDR_PREFETCH(hdr)) {
+			if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+				/* link protected by hash lock */
+				ASSERT(multilist_link_active(
+				    &hdr->b_l1hdr.b_arc_node));
 			} else {
-				buf->b_flags &= ~ARC_PREFETCH;
+				arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
 				ARCSTAT_BUMP(arcstat_mru_hits);
 			}
-			buf->b_arc_access = now;
+			hdr->b_l1hdr.b_arc_access = now;
 			return;
 		}
 
@@ -2827,18 +4453,18 @@
 		 * but it is still in the cache. Move it to the MFU
 		 * state.
 		 */
-		if (now > buf->b_arc_access + ARC_MINTIME) {
+		if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
 			/*
 			 * More than 125ms have passed since we
 			 * instantiated this buffer.  Move it to the
 			 * most frequently used state.
 			 */
-			buf->b_arc_access = now;
-			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
-			arc_change_state(arc_mfu, buf, hash_lock);
+			hdr->b_l1hdr.b_arc_access = now;
+			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+			arc_change_state(arc_mfu, hdr, hash_lock);
 		}
 		ARCSTAT_BUMP(arcstat_mru_hits);
-	} else if (buf->b_state == arc_mru_ghost) {
+	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
 		arc_state_t	*new_state;
 		/*
 		 * This buffer has been "accessed" recently, but
@@ -2846,21 +4472,21 @@
 		 * MFU state.
 		 */
 
-		if (buf->b_flags & ARC_PREFETCH) {
+		if (HDR_PREFETCH(hdr)) {
 			new_state = arc_mru;
-			if (refcount_count(&buf->b_refcnt) > 0)
-				buf->b_flags &= ~ARC_PREFETCH;
-			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+			if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
+				arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		} else {
 			new_state = arc_mfu;
-			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 		}
 
-		buf->b_arc_access = ddi_get_lbolt();
-		arc_change_state(new_state, buf, hash_lock);
+		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+		arc_change_state(new_state, hdr, hash_lock);
 
 		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
-	} else if (buf->b_state == arc_mfu) {
+	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
 		/*
 		 * This buffer has been accessed more than once and is
 		 * still in the cache.  Keep it in the MFU state.
@@ -2870,13 +4496,14 @@
 		 * If it was a prefetch, we will explicitly move it to
 		 * the head of the list now.
 		 */
-		if ((buf->b_flags & ARC_PREFETCH) != 0) {
-			ASSERT(refcount_count(&buf->b_refcnt) == 0);
-			ASSERT(list_link_active(&buf->b_arc_node));
+		if ((HDR_PREFETCH(hdr)) != 0) {
+			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+			/* link protected by hash_lock */
+			ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		}
 		ARCSTAT_BUMP(arcstat_mfu_hits);
-		buf->b_arc_access = ddi_get_lbolt();
-	} else if (buf->b_state == arc_mfu_ghost) {
+		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
 		arc_state_t	*new_state = arc_mfu;
 		/*
 		 * This buffer has been accessed more than once but has
@@ -2884,28 +4511,28 @@
 		 * MFU state.
 		 */
 
-		if (buf->b_flags & ARC_PREFETCH) {
+		if (HDR_PREFETCH(hdr)) {
 			/*
 			 * This is a prefetch access...
 			 * move this block back to the MRU state.
 			 */
-			ASSERT0(refcount_count(&buf->b_refcnt));
+			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
 			new_state = arc_mru;
 		}
 
-		buf->b_arc_access = ddi_get_lbolt();
-		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
-		arc_change_state(new_state, buf, hash_lock);
+		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+		arc_change_state(new_state, hdr, hash_lock);
 
 		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
-	} else if (buf->b_state == arc_l2c_only) {
+	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
 		/*
 		 * This buffer is on the 2nd Level ARC.
 		 */
 
-		buf->b_arc_access = ddi_get_lbolt();
-		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
-		arc_change_state(arc_mfu, buf, hash_lock);
+		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+		arc_change_state(arc_mfu, hdr, hash_lock);
 	} else {
 		ASSERT(!"invalid arc state");
 	}
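
The promotion out of the MRU state above reduces to one timestamp
check; the tick values and the ARC_MINTIME constant below are
illustrative (the code's comment puts the window at about 125 ms).

#include <stdio.h>

#define	ARC_MINTIME	12	/* illustrative ticks, ~125 ms at hz=100 */

int
main(void)
{
	long now = 100;			/* current lbolt */
	long b_arc_access = 80;		/* last touched 20 ticks ago */

	if (now > b_arc_access + ARC_MINTIME)
		printf("promote to MFU\n");
	else
		printf("keep in MRU: within the minimum lifetime\n");
	return (0);
}
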
@@ -2917,8 +4544,8 @@
 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
 	if (zio == NULL || zio->io_error == 0)
-		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
-	VERIFY(arc_buf_remove_ref(buf, arg));
+		bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr));
+	arc_buf_destroy(buf, arg);
 }
 
 /* a generic arc_done_func_t */
@@ -2927,7 +4554,7 @@
 {
 	arc_buf_t **bufp = arg;
 	if (zio && zio->io_error) {
-		VERIFY(arc_buf_remove_ref(buf, arg));
+		arc_buf_destroy(buf, arg);
 		*bufp = NULL;
 	} else {
 		*bufp = buf;
@@ -2936,18 +4563,30 @@
 }
 
 static void
+arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
+{
+	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
+		ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
+		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+	} else {
+		if (HDR_COMPRESSION_ENABLED(hdr)) {
+			ASSERT3U(HDR_GET_COMPRESS(hdr), ==,
+			    BP_GET_COMPRESS(bp));
+		}
+		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
+		ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
+	}
+}
+
+static void
 arc_read_done(zio_t *zio)
 {
-	arc_buf_hdr_t	*hdr, *found;
-	arc_buf_t	*buf;
-	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
-	kmutex_t	*hash_lock;
+	arc_buf_hdr_t	*hdr = zio->io_private;
+	arc_buf_t	*abuf = NULL;	/* buffer we're assigning to callback */
+	kmutex_t	*hash_lock = NULL;
 	arc_callback_t	*callback_list, *acb;
-	int		freeable = FALSE;
+	int		freeable = B_FALSE;
 
-	buf = zio->io_private;
-	hdr = buf->b_hdr;
-
 	/*
 	 * The hdr was inserted into hash-table and removed from lists
 	 * prior to starting I/O.  We should find this header, since
@@ -2956,35 +4595,45 @@
 	 * reason for it not to be found is if we were freed during the
 	 * read.
 	 */
-	found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
-	    &hash_lock);
+	if (HDR_IN_HASH_TABLE(hdr)) {
+		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
+		ASSERT3U(hdr->b_dva.dva_word[0], ==,
+		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
+		ASSERT3U(hdr->b_dva.dva_word[1], ==,
+		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
 
-	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
-	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
-	    (found == hdr && HDR_L2_READING(hdr)));
+		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
+		    &hash_lock);
 
-	hdr->b_flags &= ~ARC_L2_EVICTED;
-	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
-		hdr->b_flags &= ~ARC_L2CACHE;
+		ASSERT((found == hdr &&
+		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+		    (found == hdr && HDR_L2_READING(hdr)));
+		ASSERT3P(hash_lock, !=, NULL);
+	}
 
-	/* byteswap if necessary */
-	callback_list = hdr->b_acb;
-	ASSERT(callback_list != NULL);
-	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
-		dmu_object_byteswap_t bswap =
-		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
-		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
-		    byteswap_uint64_array :
-		    dmu_ot_byteswap[bswap].ob_func;
-		func(buf->b_data, hdr->b_size);
+	if (zio->io_error == 0) {
+		/* byteswap if necessary */
+		if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+			if (BP_GET_LEVEL(zio->io_bp) > 0) {
+				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
+			} else {
+				hdr->b_l1hdr.b_byteswap =
+				    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
+			}
+		} else {
+			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+		}
 	}
 
-	arc_cksum_compute(buf, B_FALSE);
-#ifdef illumos
-	arc_buf_watch(buf);
-#endif /* illumos */
+	arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
+	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
+		arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
 
-	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+	callback_list = hdr->b_l1hdr.b_acb;
+	ASSERT3P(callback_list, !=, NULL);
+
+	if (hash_lock && zio->io_error == 0 &&
+	    hdr->b_l1hdr.b_state == arc_anon) {
 		/*
 		 * Only call arc_access on anonymous buffers.  This is because
 		 * if we've issued an I/O for an evicted buffer, we've already
@@ -2995,35 +4644,55 @@
 	}
 
 	/* create copies of the data buffer for the callers */
-	abuf = buf;
 	for (acb = callback_list; acb; acb = acb->acb_next) {
-		if (acb->acb_done) {
+		if (acb->acb_done != NULL) {
+			/*
+			 * If we're here, then this must be a demand read
+			 * since prefetch requests don't have callbacks.
+			 * If a read request has a callback (i.e. acb_done is
+			 * not NULL), then we decompress the data for the
+			 * first request and clone the rest. This avoids
+			 * having to waste cpu resources decompressing data
+			 * that nobody is explicitly waiting to read.
+			 */
 			if (abuf == NULL) {
-				ARCSTAT_BUMP(arcstat_duplicate_reads);
-				abuf = arc_buf_clone(buf);
+				acb->acb_buf = arc_buf_alloc_impl(hdr,
+				    acb->acb_private);
+				if (zio->io_error == 0) {
+					zio->io_error =
+					    arc_decompress(acb->acb_buf);
+				}
+				abuf = acb->acb_buf;
+			} else {
+				add_reference(hdr, acb->acb_private);
+				acb->acb_buf = arc_buf_clone(abuf);
 			}
-			acb->acb_buf = abuf;
-			abuf = NULL;
 		}
 	}
-	hdr->b_acb = NULL;
-	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
-	ASSERT(!HDR_BUF_AVAILABLE(hdr));
-	if (abuf == buf) {
-		ASSERT(buf->b_efunc == NULL);
-		ASSERT(hdr->b_datacnt == 1);
-		hdr->b_flags |= ARC_BUF_AVAILABLE;
+	hdr->b_l1hdr.b_acb = NULL;
+	arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+	if (abuf == NULL) {
+		/*
+		 * This buffer didn't have a callback so it must
+		 * be a prefetch.
+		 */
+		ASSERT(HDR_PREFETCH(hdr));
+		ASSERT0(hdr->b_l1hdr.b_bufcnt);
+		ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
 	}
 
-	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
+	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
+	    callback_list != NULL);
 
-	if (zio->io_error != 0) {
-		hdr->b_flags |= ARC_IO_ERROR;
-		if (hdr->b_state != arc_anon)
+	if (zio->io_error == 0) {
+		arc_hdr_verify(hdr, zio->io_bp);
+	} else {
+		arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
+		if (hdr->b_l1hdr.b_state != arc_anon)
 			arc_change_state(arc_anon, hdr, hash_lock);
 		if (HDR_IN_HASH_TABLE(hdr))
 			buf_hash_remove(hdr);
-		freeable = refcount_is_zero(&hdr->b_refcnt);
+		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
 	}
 
 	/*
@@ -3031,9 +4700,9 @@
 	 * that the hdr (and hence the cv) might be freed before we get to
 	 * the cv_broadcast().
 	 */
-	cv_broadcast(&hdr->b_cv);
+	cv_broadcast(&hdr->b_l1hdr.b_cv);
 
-	if (hash_lock) {
+	if (hash_lock != NULL) {
 		mutex_exit(hash_lock);
 	} else {
 		/*
@@ -3042,8 +4711,8 @@
 		 * moved to the anonymous state (so that it won't show up
 		 * in the cache).
 		 */
-		ASSERT3P(hdr->b_state, ==, arc_anon);
-		freeable = refcount_is_zero(&hdr->b_refcnt);
+		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
 	}
 
 	/* execute each callback and free its structure */
@@ -3065,7 +4734,7 @@
 }
 
 /*
- * "Read" the block block at the specified DVA (in bp) via the
+ * "Read" the block at the specified DVA (in bp) via the
  * cache.  If the block is found in the cache, invoke the provided
  * callback immediately and return.  Note that the `zio' parameter
  * in the callback will be NULL in this case, since no IO was
@@ -3084,33 +4753,72 @@
  */
 int
 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
-    void *private, int priority, int zio_flags, uint32_t *arc_flags,
-    const zbookmark_t *zb)
+    void *private, zio_priority_t priority, int zio_flags,
+    arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
 {
-	arc_buf_hdr_t *hdr;
-	arc_buf_t *buf = NULL;
-	kmutex_t *hash_lock;
+	arc_buf_hdr_t *hdr = NULL;
+	kmutex_t *hash_lock = NULL;
 	zio_t *rzio;
 	uint64_t guid = spa_load_guid(spa);
 
+	ASSERT(!BP_IS_EMBEDDED(bp) ||
+	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
+
 top:
-	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
-	    &hash_lock);
-	if (hdr && hdr->b_datacnt > 0) {
+	if (!BP_IS_EMBEDDED(bp)) {
+		/*
+		 * Embedded BP's have no DVA and require no I/O to "read".
+		 * Create an anonymous arc buf to back it.
+		 */
+		hdr = buf_hash_find(guid, bp, &hash_lock);
+	}
 
-		*arc_flags |= ARC_CACHED;
+	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) {
+		arc_buf_t *buf = NULL;
+		*arc_flags |= ARC_FLAG_CACHED;
 
 		if (HDR_IO_IN_PROGRESS(hdr)) {
 
-			if (*arc_flags & ARC_WAIT) {
-				cv_wait(&hdr->b_cv, hash_lock);
+			if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
+			    priority == ZIO_PRIORITY_SYNC_READ) {
+				/*
+				 * This sync read must wait for an
+				 * in-progress async read (e.g. a predictive
+				 * prefetch).  Async reads are queued
+				 * separately at the vdev_queue layer, so
+				 * this is a form of priority inversion.
+				 * Ideally, we would "inherit" the demand
+				 * i/o's priority by moving the i/o from
+				 * the async queue to the synchronous queue,
+				 * but there is currently no mechanism to do
+				 * so.  Track this so that we can evaluate
+				 * the magnitude of this potential performance
+				 * problem.
+				 *
+				 * Note that if the prefetch i/o is already
+				 * active (has been issued to the device),
+				 * the prefetch improved performance, because
+				 * we issued it sooner than we would have
+				 * without the prefetch.
+				 */
+				DTRACE_PROBE1(arc__sync__wait__for__async,
+				    arc_buf_hdr_t *, hdr);
+				ARCSTAT_BUMP(arcstat_sync_wait_for_async);
+			}
+			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+				arc_hdr_clear_flags(hdr,
+				    ARC_FLAG_PREDICTIVE_PREFETCH);
+			}
+
+			if (*arc_flags & ARC_FLAG_WAIT) {
+				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
 				mutex_exit(hash_lock);
 				goto top;
 			}
-			ASSERT(*arc_flags & ARC_NOWAIT);
+			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
 
 			if (done) {
-				arc_callback_t	*acb = NULL;
+				arc_callback_t *acb = NULL;
 
 				acb = kmem_zalloc(sizeof (arc_callback_t),
 				    KM_SLEEP);
@@ -3120,10 +4828,9 @@
 					acb->acb_zio_dummy = zio_null(pio,
 					    spa, NULL, NULL, NULL, zio_flags);
 
-				ASSERT(acb->acb_done != NULL);
-				acb->acb_next = hdr->b_acb;
-				hdr->b_acb = acb;
-				add_reference(hdr, hash_lock, private);
+				ASSERT3P(acb->acb_done, !=, NULL);
+				acb->acb_next = hdr->b_l1hdr.b_acb;
+				hdr->b_l1hdr.b_acb = acb;
 				mutex_exit(hash_lock);
 				return (0);
 			}
@@ -3131,122 +4838,151 @@
 			return (0);
 		}
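
The ARC_FLAG_WAIT path above follows a classic idiom: sleep on the header's
condvar, drop the lock, and redo the entire lookup, because the header may
have changed or been freed while we slept. A hedged pthreads rendering of the
same shape (illustrative names, not the kernel primitives):

    #include <pthread.h>

    static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t io_cv = PTHREAD_COND_INITIALIZER;
    static int io_in_progress;

    static void
    lookup_blocking(void)
    {
    top:
            pthread_mutex_lock(&hash_lock);
            if (io_in_progress) {
                    /*
                     * Sleeping releases hash_lock; the I/O completion
                     * path broadcasts io_cv.
                     */
                    pthread_cond_wait(&io_cv, &hash_lock);
                    pthread_mutex_unlock(&hash_lock);
                    goto top;       /* state may have changed: look up again */
            }
            /* ... service the cache hit while still holding hash_lock ... */
            pthread_mutex_unlock(&hash_lock);
    }
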
 
-		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+		    hdr->b_l1hdr.b_state == arc_mfu);
 
 		if (done) {
-			add_reference(hdr, hash_lock, private);
+			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+				/*
+				 * This is a demand read which does not have to
+				 * wait for i/o because we did a predictive
+				 * prefetch i/o for it, which has completed.
+				 */
+				DTRACE_PROBE1(
+				    arc__demand__hit__predictive__prefetch,
+				    arc_buf_hdr_t *, hdr);
+				ARCSTAT_BUMP(
+				    arcstat_demand_hit_predictive_prefetch);
+				arc_hdr_clear_flags(hdr,
+				    ARC_FLAG_PREDICTIVE_PREFETCH);
+			}
+			ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
+
 			/*
 			 * If this block is already in use, create a new
 			 * copy of the data so that we will be guaranteed
 			 * that arc_release() will always succeed.
 			 */
-			buf = hdr->b_buf;
-			ASSERT(buf);
-			ASSERT(buf->b_data);
-			if (HDR_BUF_AVAILABLE(hdr)) {
-				ASSERT(buf->b_efunc == NULL);
-				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+			buf = hdr->b_l1hdr.b_buf;
+			if (buf == NULL) {
+				ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+				ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+				buf = arc_buf_alloc_impl(hdr, private);
+				VERIFY0(arc_decompress(buf));
 			} else {
+				add_reference(hdr, private);
 				buf = arc_buf_clone(buf);
 			}
+			ASSERT3P(buf->b_data, !=, NULL);
 
-		} else if (*arc_flags & ARC_PREFETCH &&
-		    refcount_count(&hdr->b_refcnt) == 0) {
-			hdr->b_flags |= ARC_PREFETCH;
+		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
+		    refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
 		}
 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 		arc_access(hdr, hash_lock);
-		if (*arc_flags & ARC_L2CACHE)
-			hdr->b_flags |= ARC_L2CACHE;
-		if (*arc_flags & ARC_L2COMPRESS)
-			hdr->b_flags |= ARC_L2COMPRESS;
+		if (*arc_flags & ARC_FLAG_L2CACHE)
+			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 		mutex_exit(hash_lock);
 		ARCSTAT_BUMP(arcstat_hits);
-		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
-		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
 		    data, metadata, hits);
 
 		if (done)
 			done(NULL, buf, private);
 	} else {
-		uint64_t size = BP_GET_LSIZE(bp);
-		arc_callback_t	*acb;
+		uint64_t lsize = BP_GET_LSIZE(bp);
+		uint64_t psize = BP_GET_PSIZE(bp);
+		arc_callback_t *acb;
 		vdev_t *vd = NULL;
 		uint64_t addr = 0;
 		boolean_t devw = B_FALSE;
+		uint64_t size;
 
 		if (hdr == NULL) {
 			/* this block is not in the cache */
-			arc_buf_hdr_t	*exists;
+			arc_buf_hdr_t *exists = NULL;
 			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
-			buf = arc_buf_alloc(spa, size, private, type);
-			hdr = buf->b_hdr;
-			hdr->b_dva = *BP_IDENTITY(bp);
-			hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
-			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
-			exists = buf_hash_insert(hdr, &hash_lock);
-			if (exists) {
+			hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
+			    BP_GET_COMPRESS(bp), type);
+
+			if (!BP_IS_EMBEDDED(bp)) {
+				hdr->b_dva = *BP_IDENTITY(bp);
+				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
+				exists = buf_hash_insert(hdr, &hash_lock);
+			}
+			if (exists != NULL) {
 				/* somebody beat us to the hash insert */
 				mutex_exit(hash_lock);
 				buf_discard_identity(hdr);
-				(void) arc_buf_remove_ref(buf, private);
+				arc_hdr_destroy(hdr);
 				goto top; /* restart the IO request */
 			}
-			/* if this is a prefetch, we don't have a reference */
-			if (*arc_flags & ARC_PREFETCH) {
-				(void) remove_reference(hdr, hash_lock,
-				    private);
-				hdr->b_flags |= ARC_PREFETCH;
+		} else {
+			/*
+			 * This block is in the ghost cache. If it was L2-only
+			 * (and thus didn't have an L1 hdr), we realloc the
+			 * header to add an L1 hdr.
+			 */
+			if (!HDR_HAS_L1HDR(hdr)) {
+				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
+				    hdr_full_cache);
 			}
-			if (*arc_flags & ARC_L2CACHE)
-				hdr->b_flags |= ARC_L2CACHE;
-			if (*arc_flags & ARC_L2COMPRESS)
-				hdr->b_flags |= ARC_L2COMPRESS;
-			if (BP_GET_LEVEL(bp) > 0)
-				hdr->b_flags |= ARC_INDIRECT;
-		} else {
-			/* this block is in the ghost cache */
-			ASSERT(GHOST_STATE(hdr->b_state));
+			ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+			ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-			ASSERT0(refcount_count(&hdr->b_refcnt));
-			ASSERT(hdr->b_buf == NULL);
+			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+			ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 
-			/* if this is a prefetch, we don't have a reference */
-			if (*arc_flags & ARC_PREFETCH)
-				hdr->b_flags |= ARC_PREFETCH;
-			else
-				add_reference(hdr, hash_lock, private);
-			if (*arc_flags & ARC_L2CACHE)
-				hdr->b_flags |= ARC_L2CACHE;
-			if (*arc_flags & ARC_L2COMPRESS)
-				hdr->b_flags |= ARC_L2COMPRESS;
-			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
-			buf->b_hdr = hdr;
-			buf->b_data = NULL;
-			buf->b_efunc = NULL;
-			buf->b_private = NULL;
-			buf->b_next = NULL;
-			hdr->b_buf = buf;
-			ASSERT(hdr->b_datacnt == 0);
-			hdr->b_datacnt = 1;
-			arc_get_data_buf(buf);
+			/*
+			 * This is a delicate dance that we play here.
+			 * This hdr is in the ghost list so we access it
+			 * to move it out of the ghost list before we
+			 * initiate the read. If it's a prefetch then
+			 * it won't have a callback so we'll remove the
+			 * reference that arc_buf_alloc_impl() created. We
+			 * do this after we've called arc_access() to
+			 * avoid hitting an assert in remove_reference().
+			 */
 			arc_access(hdr, hash_lock);
+			arc_hdr_alloc_pdata(hdr);
 		}
+		ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+		size = arc_hdr_size(hdr);
 
-		ASSERT(!GHOST_STATE(hdr->b_state));
+		/*
+		 * If compression is enabled on the hdr, then we will do
+		 * RAW I/O and store the compressed data in the hdr's
+		 * data block. Otherwise, the hdr's data block will contain
+		 * the uncompressed data.
+		 */
+		if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
+			zio_flags |= ZIO_FLAG_RAW;
+		}
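
Put differently: a compressed header asks the I/O layer for psize bytes and
keeps them raw, deferring decompression to lsize until a reader needs it,
while an uncompressed header reads lsize directly. A small sketch of that
decision (the struct and helper are illustrative):

    #include <stdint.h>

    struct hdr_sizes {
            uint64_t psize;   /* physical size: on-disk, maybe compressed */
            uint64_t lsize;   /* logical size: uncompressed */
            int compressed;   /* HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF */
    };

    /* bytes to request from disk, and whether to bypass zio decompression */
    static uint64_t
    io_size(const struct hdr_sizes *h, int *raw)
    {
            *raw = h->compressed;
            return (h->compressed ? h->psize : h->lsize);
    }
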
 
+		if (*arc_flags & ARC_FLAG_PREFETCH)
+			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+		if (*arc_flags & ARC_FLAG_L2CACHE)
+			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+		if (BP_GET_LEVEL(bp) > 0)
+			arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
+		if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
+			arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
+		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
+
 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
 		acb->acb_done = done;
 		acb->acb_private = private;
 
-		ASSERT(hdr->b_acb == NULL);
-		hdr->b_acb = acb;
-		hdr->b_flags |= ARC_IO_IN_PROGRESS;
+		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+		hdr->b_l1hdr.b_acb = acb;
+		arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 
-		if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
-		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
-			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
-			addr = hdr->b_l2hdr->b_daddr;
+		if (HDR_HAS_L2HDR(hdr) &&
+		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
+			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
+			addr = hdr->b_l2hdr.b_daddr;
 			/*
 			 * Lock out device removal.
 			 */
@@ -3255,18 +4991,25 @@
 				vd = NULL;
 		}
 
-		mutex_exit(hash_lock);
+		if (priority == ZIO_PRIORITY_ASYNC_READ)
+			arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
+		else
+			arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
 
+		if (hash_lock != NULL)
+			mutex_exit(hash_lock);
+
 		/*
 		 * At this point, we have a level 1 cache miss.  Try again in
 		 * L2ARC if possible.
 		 */
-		ASSERT3U(hdr->b_size, ==, size);
+		ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
+
 		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
-		    uint64_t, size, zbookmark_t *, zb);
+		    uint64_t, lsize, zbookmark_phys_t *, zb);
 		ARCSTAT_BUMP(arcstat_misses);
-		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
-		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
 		    data, metadata, misses);
 #ifdef _KERNEL
 		curthread->td_ru.ru_inblock++;
@@ -3282,10 +5025,11 @@
 			 *    also have invalidated the vdev.
 			 * 5. This isn't prefetch and l2arc_noprefetch is set.
 			 */
-			if (hdr->b_l2hdr != NULL &&
+			if (HDR_HAS_L2HDR(hdr) &&
 			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
 			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
 				l2arc_read_callback_t *cb;
+				void *b_data;
 
 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_hits);
@@ -3292,15 +5036,20 @@
 
 				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
 				    KM_SLEEP);
-				cb->l2rcb_buf = buf;
-				cb->l2rcb_spa = spa;
+				cb->l2rcb_hdr = hdr;
 				cb->l2rcb_bp = *bp;
 				cb->l2rcb_zb = *zb;
 				cb->l2rcb_flags = zio_flags;
-				cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
+				uint64_t asize = vdev_psize_to_asize(vd, size);
+				if (asize != size) {
+					b_data = zio_data_buf_alloc(asize);
+					cb->l2rcb_data = b_data;
+				} else {
+					b_data = hdr->b_l1hdr.b_pdata;
+				}
 
 				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
-				    addr + size < vd->vdev_psize -
+				    addr + asize < vd->vdev_psize -
 				    VDEV_LABEL_END_SIZE);
 
 				/*
@@ -3309,35 +5058,26 @@
 				 * Issue a null zio if the underlying buffer
 				 * was squashed to zero size by compression.
 				 */
-				if (hdr->b_l2hdr->b_compress ==
-				    ZIO_COMPRESS_EMPTY) {
-					rzio = zio_null(pio, spa, vd,
-					    l2arc_read_done, cb,
-					    zio_flags | ZIO_FLAG_DONT_CACHE |
-					    ZIO_FLAG_CANFAIL |
-					    ZIO_FLAG_DONT_PROPAGATE |
-					    ZIO_FLAG_DONT_RETRY);
-				} else {
-					rzio = zio_read_phys(pio, vd, addr,
-					    hdr->b_l2hdr->b_asize,
-					    buf->b_data, ZIO_CHECKSUM_OFF,
-					    l2arc_read_done, cb, priority,
-					    zio_flags | ZIO_FLAG_DONT_CACHE |
-					    ZIO_FLAG_CANFAIL |
-					    ZIO_FLAG_DONT_PROPAGATE |
-					    ZIO_FLAG_DONT_RETRY, B_FALSE);
-				}
+				ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
+				    ZIO_COMPRESS_EMPTY);
+				rzio = zio_read_phys(pio, vd, addr,
+				    asize, b_data,
+				    ZIO_CHECKSUM_OFF,
+				    l2arc_read_done, cb, priority,
+				    zio_flags | ZIO_FLAG_DONT_CACHE |
+				    ZIO_FLAG_CANFAIL |
+				    ZIO_FLAG_DONT_PROPAGATE |
+				    ZIO_FLAG_DONT_RETRY, B_FALSE);
 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
 				    zio_t *, rzio);
-				ARCSTAT_INCR(arcstat_l2_read_bytes,
-				    hdr->b_l2hdr->b_asize);
+				ARCSTAT_INCR(arcstat_l2_read_bytes, size);
 
-				if (*arc_flags & ARC_NOWAIT) {
+				if (*arc_flags & ARC_FLAG_NOWAIT) {
 					zio_nowait(rzio);
 					return (0);
 				}
 
-				ASSERT(*arc_flags & ARC_WAIT);
+				ASSERT(*arc_flags & ARC_FLAG_WAIT);
 				if (zio_wait(rzio) == 0)
 					return (0);
 
@@ -3360,31 +5100,18 @@
 			}
 		}
 
-		rzio = zio_read(pio, spa, bp, buf->b_data, size,
-		    arc_read_done, buf, priority, zio_flags, zb);
+		rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size,
+		    arc_read_done, hdr, priority, zio_flags, zb);
 
-		if (*arc_flags & ARC_WAIT)
+		if (*arc_flags & ARC_FLAG_WAIT)
 			return (zio_wait(rzio));
 
-		ASSERT(*arc_flags & ARC_NOWAIT);
+		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
 		zio_nowait(rzio);
 	}
 	return (0);
 }
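
A hypothetical caller of the widened arc_read() signature above; the callback
and its private cookie are placeholders for illustration, and error handling
is elided:

    /* invoked on completion; zio is NULL when the block was a cache hit */
    static void
    my_read_done(zio_t *zio, arc_buf_t *buf, void *private)
    {
            /* consume buf->b_data here, then drop the buffer reference */
    }

    static int
    my_read_block(spa_t *spa, const blkptr_t *bp, const zbookmark_phys_t *zb)
    {
            arc_flags_t aflags = ARC_FLAG_WAIT;

            return (arc_read(NULL, spa, bp, my_read_done, NULL,
                ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb));
    }
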
 
-void
-arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
-{
-	ASSERT(buf->b_hdr != NULL);
-	ASSERT(buf->b_hdr->b_state != arc_anon);
-	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
-	ASSERT(buf->b_efunc == NULL);
-	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
-
-	buf->b_efunc = func;
-	buf->b_private = private;
-}
-
 /*
  * Notify the arc that a block was freed, and thus will never be used again.
  */
@@ -3395,18 +5122,39 @@
 	kmutex_t *hash_lock;
 	uint64_t guid = spa_load_guid(spa);
 
-	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
-	    &hash_lock);
+	ASSERT(!BP_IS_EMBEDDED(bp));
+
+	hdr = buf_hash_find(guid, bp, &hash_lock);
 	if (hdr == NULL)
 		return;
-	if (HDR_BUF_AVAILABLE(hdr)) {
-		arc_buf_t *buf = hdr->b_buf;
-		add_reference(hdr, hash_lock, FTAG);
-		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+
+	/*
+	 * We might be trying to free a block that is still doing I/O
+	 * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
+	 * dmu_sync-ed block). If this block is being prefetched, then it
+	 * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
+	 * until the I/O completes. A block may also have a reference if it is
+	 * part of a dedup-ed, dmu_sync-ed write. The dmu_sync() function would
+	 * have written the new block to its final resting place on disk but
+	 * without the dedup flag set. This would have left the hdr in the MRU
+	 * state and discoverable. When the txg finally syncs it detects that
+	 * the block was overridden in open context and issues an override I/O.
+	 * Since this is a dedup block, the override I/O will determine if the
+	 * block is already in the DDT. If so, then it will replace the io_bp
+	 * with the bp from the DDT and allow the I/O to finish. When the I/O
+	 * reaches the done callback, dbuf_write_override_done, it will
+	 * check to see if the io_bp and io_bp_override are identical.
+	 * If they are not, then it indicates that the bp was replaced with
+	 * the bp in the DDT and the override bp is freed. This allows
+	 * us to arrive here with a reference on a block that is being
+	 * freed. So if we have an I/O in progress, or a reference to
+	 * this hdr, then we don't destroy the hdr.
+	 */
+	if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
+	    refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
+		arc_change_state(arc_anon, hdr, hash_lock);
+		arc_hdr_destroy(hdr);
 		mutex_exit(hash_lock);
-
-		arc_release(buf, FTAG);
-		(void) arc_buf_remove_ref(buf, FTAG);
 	} else {
 		mutex_exit(hash_lock);
 	}
@@ -3414,94 +5162,6 @@
 }
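
Stripped of the ZFS specifics, the guard above is the general rule for tearing
down a refcounted object with asynchronous I/O: destroy it only when nothing
in flight and no holder can still observe it. A standalone sketch:

    #include <stdbool.h>

    struct cache_obj {
            int refcnt;
            bool io_in_progress;
    };

    static bool
    safe_to_destroy(const struct cache_obj *o)
    {
            /* an in-flight I/O or a live reference keeps the object alive */
            return (!o->io_in_progress && o->refcnt == 0);
    }
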
 
 /*
- * This is used by the DMU to let the ARC know that a buffer is
- * being evicted, so the ARC should clean up.  If this arc buf
- * is not yet in the evicted state, it will be put there.
- */
-int
-arc_buf_evict(arc_buf_t *buf)
-{
-	arc_buf_hdr_t *hdr;
-	kmutex_t *hash_lock;
-	arc_buf_t **bufp;
-	list_t *list, *evicted_list;
-	kmutex_t *lock, *evicted_lock;
-
-	mutex_enter(&buf->b_evict_lock);
-	hdr = buf->b_hdr;
-	if (hdr == NULL) {
-		/*
-		 * We are in arc_do_user_evicts().
-		 */
-		ASSERT(buf->b_data == NULL);
-		mutex_exit(&buf->b_evict_lock);
-		return (0);
-	} else if (buf->b_data == NULL) {
-		arc_buf_t copy = *buf; /* structure assignment */
-		/*
-		 * We are on the eviction list; process this buffer now
-		 * but let arc_do_user_evicts() do the reaping.
-		 */
-		buf->b_efunc = NULL;
-		mutex_exit(&buf->b_evict_lock);
-		VERIFY(copy.b_efunc(&copy) == 0);
-		return (1);
-	}
-	hash_lock = HDR_LOCK(hdr);
-	mutex_enter(hash_lock);
-	hdr = buf->b_hdr;
-	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
-
-	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
-	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
-
-	/*
-	 * Pull this buffer off of the hdr
-	 */
-	bufp = &hdr->b_buf;
-	while (*bufp != buf)
-		bufp = &(*bufp)->b_next;
-	*bufp = buf->b_next;
-
-	ASSERT(buf->b_data != NULL);
-	arc_buf_destroy(buf, FALSE, FALSE);
-
-	if (hdr->b_datacnt == 0) {
-		arc_state_t *old_state = hdr->b_state;
-		arc_state_t *evicted_state;
-
-		ASSERT(hdr->b_buf == NULL);
-		ASSERT(refcount_is_zero(&hdr->b_refcnt));
-
-		evicted_state =
-		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
-
-		get_buf_info(hdr, old_state, &list, &lock);
-		get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock);
-		mutex_enter(lock);
-		mutex_enter(evicted_lock);
-
-		arc_change_state(evicted_state, hdr, hash_lock);
-		ASSERT(HDR_IN_HASH_TABLE(hdr));
-		hdr->b_flags |= ARC_IN_HASH_TABLE;
-		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
-
-		mutex_exit(evicted_lock);
-		mutex_exit(lock);
-	}
-	mutex_exit(hash_lock);
-	mutex_exit(&buf->b_evict_lock);
-
-	VERIFY(buf->b_efunc(buf) == 0);
-	buf->b_efunc = NULL;
-	buf->b_private = NULL;
-	buf->b_hdr = NULL;
-	buf->b_next = NULL;
-	kmem_cache_free(buf_cache, buf);
-	return (1);
-}
-
-/*
  * Release this buffer from the cache, making it an anonymous buffer.  This
  * must be done after a read and prior to modifying the buffer contents.
  * If the buffer has more than one reference, we must make
@@ -3510,10 +5170,7 @@
 void
 arc_release(arc_buf_t *buf, void *tag)
 {
-	arc_buf_hdr_t *hdr;
-	kmutex_t *hash_lock = NULL;
-	l2arc_buf_hdr_t *l2hdr;
-	uint64_t buf_size;
+	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	/*
 	 * It would be nice to assert that if it's DMU metadata (level >
@@ -3522,117 +5179,193 @@
 	 */
 
 	mutex_enter(&buf->b_evict_lock);
-	hdr = buf->b_hdr;
 
-	/* this buffer is not on any list */
-	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+	ASSERT(HDR_HAS_L1HDR(hdr));
 
-	if (hdr->b_state == arc_anon) {
-		/* this buffer is already released */
-		ASSERT(buf->b_efunc == NULL);
-	} else {
-		hash_lock = HDR_LOCK(hdr);
-		mutex_enter(hash_lock);
-		hdr = buf->b_hdr;
-		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+	/*
+	 * We don't grab the hash lock prior to this check, because if
+	 * the buffer's header is in the arc_anon state, it won't be
+	 * linked into the hash table.
+	 */
+	if (hdr->b_l1hdr.b_state == arc_anon) {
+		mutex_exit(&buf->b_evict_lock);
+		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+		ASSERT(!HDR_IN_HASH_TABLE(hdr));
+		ASSERT(!HDR_HAS_L2HDR(hdr));
+		ASSERT(HDR_EMPTY(hdr));
+		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+		ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
+		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+
+		hdr->b_l1hdr.b_arc_access = 0;
+
+		/*
+		 * If the buf is being overridden then it may already
+		 * have a hdr that is not empty.
+		 */
+		buf_discard_identity(hdr);
+		arc_buf_thaw(buf);
+
+		return;
 	}
 
-	l2hdr = hdr->b_l2hdr;
-	if (l2hdr) {
-		mutex_enter(&l2arc_buflist_mtx);
-		hdr->b_l2hdr = NULL;
+	kmutex_t *hash_lock = HDR_LOCK(hdr);
+	mutex_enter(hash_lock);
+
+	/*
+	 * This assignment is only valid as long as the hash_lock is
+	 * held, we must be careful not to reference state or the
+	 * b_state field after dropping the lock.
+	 */
+	arc_state_t *state = hdr->b_l1hdr.b_state;
+	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+	ASSERT3P(state, !=, arc_anon);
+
+	/* this buffer is not on any list */
+	ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
+
+	if (HDR_HAS_L2HDR(hdr)) {
+		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+
+		/*
+		 * We have to recheck this conditional again now that
+		 * we're holding the l2ad_mtx to prevent a race with
+		 * another thread which might be concurrently calling
+		 * l2arc_evict(). In that case, l2arc_evict() might have
+		 * destroyed the header's L2 portion as we were waiting
+		 * to acquire the l2ad_mtx.
+		 */
+		if (HDR_HAS_L2HDR(hdr)) {
+			l2arc_trim(hdr);
+			arc_hdr_l2hdr_destroy(hdr);
+		}
+
+		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
 	}
-	buf_size = hdr->b_size;
 
 	/*
 	 * Do we have more than one buf?
 	 */
-	if (hdr->b_datacnt > 1) {
+	if (hdr->b_l1hdr.b_bufcnt > 1) {
 		arc_buf_hdr_t *nhdr;
 		arc_buf_t **bufp;
-		uint64_t blksz = hdr->b_size;
 		uint64_t spa = hdr->b_spa;
-		arc_buf_contents_t type = hdr->b_type;
-		uint32_t flags = hdr->b_flags;
+		uint64_t psize = HDR_GET_PSIZE(hdr);
+		uint64_t lsize = HDR_GET_LSIZE(hdr);
+		enum zio_compress compress = HDR_GET_COMPRESS(hdr);
+		arc_buf_contents_t type = arc_buf_type(hdr);
+		VERIFY3U(hdr->b_type, ==, type);
 
-		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
+		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
+		(void) remove_reference(hdr, hash_lock, tag);
+
+		if (arc_buf_is_shared(buf)) {
+			ASSERT(HDR_SHARED_DATA(hdr));
+			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
+			ASSERT(ARC_BUF_LAST(buf));
+		}
+
 		/*
 		 * Pull the data off of this hdr and attach it to
-		 * a new anonymous hdr.
+		 * a new anonymous hdr. Also find the last buffer
+		 * in the hdr's buffer list.
 		 */
-		(void) remove_reference(hdr, hash_lock, tag);
-		bufp = &hdr->b_buf;
-		while (*bufp != buf)
-			bufp = &(*bufp)->b_next;
-		*bufp = buf->b_next;
-		buf->b_next = NULL;
+		arc_buf_t *lastbuf = NULL;
+		bufp = &hdr->b_l1hdr.b_buf;
+		while (*bufp != NULL) {
+			if (*bufp == buf) {
+				*bufp = buf->b_next;
+			}
 
-		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
-		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
-		if (refcount_is_zero(&hdr->b_refcnt)) {
-			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
-			ASSERT3U(*size, >=, hdr->b_size);
-			atomic_add_64(size, -hdr->b_size);
+			/*
+			 * If we've removed a buffer in the middle of
+			 * the list then update the lastbuf and update
+			 * bufp.
+			 */
+			if (*bufp != NULL) {
+				lastbuf = *bufp;
+				bufp = &(*bufp)->b_next;
+			}
 		}
+		buf->b_next = NULL;
+		ASSERT3P(lastbuf, !=, buf);
+		ASSERT3P(lastbuf, !=, NULL);
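
The walk above does two things in one pass: it unlinks buf and learns which
buffer will be last once buf is gone. The same logic in standalone form:

    #include <assert.h>
    #include <stddef.h>

    struct abuf {
            struct abuf *b_next;
    };

    /* unlink `buf` from the list at *head; return the last surviving node */
    static struct abuf *
    unlink_and_find_last(struct abuf **head, struct abuf *buf)
    {
            struct abuf *lastbuf = NULL;
            struct abuf **bufp = head;

            while (*bufp != NULL) {
                    if (*bufp == buf)
                            *bufp = buf->b_next;    /* splice the target out */
                    if (*bufp != NULL) {
                            lastbuf = *bufp;        /* a node that remains */
                            bufp = &(*bufp)->b_next;
                    }
            }
            buf->b_next = NULL;
            assert(lastbuf != buf);
            return (lastbuf);
    }
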
 
 		/*
-		 * We're releasing a duplicate user data buffer, update
-		 * our statistics accordingly.
+		 * If the current arc_buf_t and the hdr are sharing their data
+		 * buffer, then we must stop sharing that block, transfer
+		 * ownership and set up sharing with a new arc_buf_t at the end
+		 * of the hdr's b_buf list.
 		 */
-		if (hdr->b_type == ARC_BUFC_DATA) {
-			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
-			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
-			    -hdr->b_size);
+		if (arc_buf_is_shared(buf)) {
+			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
+			ASSERT(ARC_BUF_LAST(lastbuf));
+			VERIFY(!arc_buf_is_shared(lastbuf));
+
+			/*
+			 * First, sever the block sharing relationship between
+			 * buf and the arc_buf_hdr_t. Then, setup a new
+			 * block sharing relationship with the last buffer
+			 * on the arc_buf_t list.
+			 */
+			arc_unshare_buf(hdr, buf);
+			arc_share_buf(hdr, lastbuf);
+			VERIFY3P(lastbuf->b_data, !=, NULL);
+		} else if (HDR_SHARED_DATA(hdr)) {
+			ASSERT(arc_buf_is_shared(lastbuf));
 		}
-		hdr->b_datacnt -= 1;
+		ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+		ASSERT3P(state, !=, arc_l2c_only);
+
+		(void) refcount_remove_many(&state->arcs_size,
+		    HDR_GET_LSIZE(hdr), buf);
+
+		if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+			ASSERT3P(state, !=, arc_l2c_only);
+			(void) refcount_remove_many(&state->arcs_esize[type],
+			    HDR_GET_LSIZE(hdr), buf);
+		}
+
+		hdr->b_l1hdr.b_bufcnt -= 1;
 		arc_cksum_verify(buf);
 #ifdef illumos
 		arc_buf_unwatch(buf);
-#endif /* illumos */
+#endif
 
 		mutex_exit(hash_lock);
 
-		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
-		nhdr->b_size = blksz;
-		nhdr->b_spa = spa;
-		nhdr->b_type = type;
-		nhdr->b_buf = buf;
-		nhdr->b_state = arc_anon;
-		nhdr->b_arc_access = 0;
-		nhdr->b_flags = flags & ARC_L2_WRITING;
-		nhdr->b_l2hdr = NULL;
-		nhdr->b_datacnt = 1;
-		nhdr->b_freeze_cksum = NULL;
-		(void) refcount_add(&nhdr->b_refcnt, tag);
+		/*
+		 * Allocate a new hdr. The new hdr will contain a b_pdata
+		 * buffer which will be freed in arc_write().
+		 */
+		nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
+		ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
+		ASSERT0(nhdr->b_l1hdr.b_bufcnt);
+		ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt));
+		VERIFY3U(nhdr->b_type, ==, type);
+		ASSERT(!HDR_SHARED_DATA(nhdr));
+
+		nhdr->b_l1hdr.b_buf = buf;
+		nhdr->b_l1hdr.b_bufcnt = 1;
+		(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
 		buf->b_hdr = nhdr;
+
 		mutex_exit(&buf->b_evict_lock);
-		atomic_add_64(&arc_anon->arcs_size, blksz);
+		(void) refcount_add_many(&arc_anon->arcs_size,
+		    HDR_GET_LSIZE(nhdr), buf);
 	} else {
 		mutex_exit(&buf->b_evict_lock);
-		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
-		ASSERT(!list_link_active(&hdr->b_arc_node));
+		ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
+		/* protected by hash lock, or hdr is on arc_anon */
+		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-		if (hdr->b_state != arc_anon)
-			arc_change_state(arc_anon, hdr, hash_lock);
-		hdr->b_arc_access = 0;
-		if (hash_lock)
-			mutex_exit(hash_lock);
+		arc_change_state(arc_anon, hdr, hash_lock);
+		hdr->b_l1hdr.b_arc_access = 0;
+		mutex_exit(hash_lock);
 
 		buf_discard_identity(hdr);
 		arc_buf_thaw(buf);
 	}
-	buf->b_efunc = NULL;
-	buf->b_private = NULL;
-
-	if (l2hdr) {
-		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
-		trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
-		    hdr->b_size, 0);
-		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
-		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
-		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
-		mutex_exit(&l2arc_buflist_mtx);
-	}
 }
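
A hypothetical use of that contract (sketch only; the tag and surrounding
transaction logic are placeholders): release after the read and before the
modification, so the buffer is guaranteed to be a private anonymous copy.

    static void
    modify_block(arc_buf_t *buf, void *tag)
    {
            /* detach from the cache; clones the data first if it was shared */
            arc_release(buf, tag);
            ASSERT(arc_released(buf));

            /* buf->b_data is now an anonymous, writable copy */
    }
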
 
 int
@@ -3641,22 +5374,12 @@
 	int released;
 
 	mutex_enter(&buf->b_evict_lock);
-	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
+	released = (buf->b_data != NULL &&
+	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
 	mutex_exit(&buf->b_evict_lock);
 	return (released);
 }
 
-int
-arc_has_callback(arc_buf_t *buf)
-{
-	int callback;
-
-	mutex_enter(&buf->b_evict_lock);
-	callback = (buf->b_efunc != NULL);
-	mutex_exit(&buf->b_evict_lock);
-	return (callback);
-}
-
 #ifdef ZFS_DEBUG
 int
 arc_referenced(arc_buf_t *buf)
@@ -3664,7 +5387,7 @@
 	int referenced;
 
 	mutex_enter(&buf->b_evict_lock);
-	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
+	referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
 	mutex_exit(&buf->b_evict_lock);
 	return (referenced);
 }
@@ -3676,29 +5399,109 @@
 	arc_write_callback_t *callback = zio->io_private;
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
+	uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp);
 
-	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
-	callback->awcb_ready(zio, buf, callback->awcb_private);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
+	ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
 
 	/*
-	 * If the IO is already in progress, then this is a re-write
-	 * attempt, so we need to thaw and re-compute the cksum.
-	 * It is the responsibility of the callback to handle the
-	 * accounting for any re-write attempt.
+	 * If we're reexecuting this zio because the pool suspended, then
+	 * clean up any state that was previously set the first time the
+	 * callback was invoked.
 	 */
-	if (HDR_IO_IN_PROGRESS(hdr)) {
-		mutex_enter(&hdr->b_freeze_lock);
-		if (hdr->b_freeze_cksum != NULL) {
-			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
-			hdr->b_freeze_cksum = NULL;
+	if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
+		arc_cksum_free(hdr);
+#ifdef illumos
+		arc_buf_unwatch(buf);
+#endif
+		if (hdr->b_l1hdr.b_pdata != NULL) {
+			if (arc_buf_is_shared(buf)) {
+				ASSERT(HDR_SHARED_DATA(hdr));
+
+				arc_unshare_buf(hdr, buf);
+			} else {
+				arc_hdr_free_pdata(hdr);
+			}
 		}
-		mutex_exit(&hdr->b_freeze_lock);
 	}
-	arc_cksum_compute(buf, B_FALSE);
-	hdr->b_flags |= ARC_IO_IN_PROGRESS;
+	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+	ASSERT(!HDR_SHARED_DATA(hdr));
+	ASSERT(!arc_buf_is_shared(buf));
+
+	callback->awcb_ready(zio, buf, callback->awcb_private);
+
+	if (HDR_IO_IN_PROGRESS(hdr))
+		ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
+
+	arc_cksum_compute(buf);
+	arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+
+	enum zio_compress compress;
+	if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
+		compress = ZIO_COMPRESS_OFF;
+	} else {
+		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp));
+		compress = BP_GET_COMPRESS(zio->io_bp);
+	}
+	HDR_SET_PSIZE(hdr, psize);
+	arc_hdr_set_compress(hdr, compress);
+
+	/*
+	 * If the hdr is compressed, then copy the compressed
+	 * zio contents into arc_buf_hdr_t. Otherwise, copy the original
+	 * data buf into the hdr. Ideally, we would like to always copy the
+	 * io_data into b_pdata but the user may have disabled compressed
+	 * arc thus the on-disk block may or may not match what we maintain
+	 * in the hdr's b_pdata field.
+	 */
+	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
+		ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF);
+		ASSERT3U(psize, >, 0);
+		arc_hdr_alloc_pdata(hdr);
+		bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize);
+	} else {
+		ASSERT3P(buf->b_data, ==, zio->io_orig_data);
+		ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr));
+		ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS);
+		ASSERT(!HDR_SHARED_DATA(hdr));
+		ASSERT(!arc_buf_is_shared(buf));
+		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+		ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+
+		/*
+		 * This hdr is not compressed so we're able to share
+		 * the arc_buf_t data buffer with the hdr.
+		 */
+		arc_share_buf(hdr, buf);
+		VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata,
+		    HDR_GET_LSIZE(hdr)));
+	}
+	arc_hdr_verify(hdr, zio->io_bp);
 }
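
Condensed, the branch above either copies the psize compressed bytes into the
header's private block or shares the caller's lsize buffer with no copy at
all. A small illustration of the two costs (types and helper are illustrative,
not the kernel code):

    #include <stdint.h>
    #include <string.h>

    /*
     * Returns the number of bytes physically copied: psize when the on-disk
     * block is compressed, zero when the uncompressed buffer can be shared.
     */
    static uint64_t
    fill_hdr(void *pdata, const void *io_data, uint64_t psize,
        const void **share, const void *buf_data, int compressed)
    {
            if (compressed) {
                    memcpy(pdata, io_data, psize);  /* keep on-disk bytes */
                    return (psize);
            }
            *share = buf_data;      /* no copy: share the buffer */
            return (0);
    }
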
 
 static void
+arc_write_children_ready(zio_t *zio)
+{
+	arc_write_callback_t *callback = zio->io_private;
+	arc_buf_t *buf = callback->awcb_buf;
+
+	callback->awcb_children_ready(zio, buf, callback->awcb_private);
+}
+
+/*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write.  See the comment in dbuf_write_physdone() for details.
+ */
+static void
+arc_write_physdone(zio_t *zio)
+{
+	arc_write_callback_t *cb = zio->io_private;
+	if (cb->awcb_physdone != NULL)
+		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
+}
+
+static void
 arc_write_done(zio_t *zio)
 {
 	arc_write_callback_t *callback = zio->io_private;
@@ -3705,23 +5508,28 @@
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
-	ASSERT(hdr->b_acb == NULL);
+	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 
 	if (zio->io_error == 0) {
-		hdr->b_dva = *BP_IDENTITY(zio->io_bp);
-		hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
-		hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+		arc_hdr_verify(hdr, zio->io_bp);
+
+		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
+			buf_discard_identity(hdr);
+		} else {
+			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+		}
 	} else {
-		ASSERT(BUF_EMPTY(hdr));
+		ASSERT(HDR_EMPTY(hdr));
 	}
 
 	/*
-	 * If the block to be written was all-zero, we may have
-	 * compressed it away.  In this case no write was performed
-	 * so there will be no dva/birth/checksum.  The buffer must
-	 * therefore remain anonymous (and uncached).
+	 * If the block to be written was all-zero or compressed enough to be
+	 * embedded in the BP, no write was performed so there will be no
+	 * dva/birth/checksum.  The buffer must therefore remain anonymous
+	 * (and uncached).
 	 */
-	if (!BUF_EMPTY(hdr)) {
+	if (!HDR_EMPTY(hdr)) {
 		arc_buf_hdr_t *exists;
 		kmutex_t *hash_lock;
 
@@ -3730,7 +5538,7 @@
 		arc_cksum_verify(buf);
 
 		exists = buf_hash_insert(hdr, &hash_lock);
-		if (exists) {
+		if (exists != NULL) {
 			/*
 			 * This can only happen if we overwrite for
 			 * sync-to-convergence, because we remove
@@ -3740,7 +5548,8 @@
 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
 					panic("bad overwrite, hdr=%p exists=%p",
 					    (void *)hdr, (void *)exists);
-				ASSERT(refcount_is_zero(&exists->b_refcnt));
+				ASSERT(refcount_is_zero(
+				    &exists->b_l1hdr.b_refcnt));
 				arc_change_state(arc_anon, exists, hash_lock);
 				mutex_exit(hash_lock);
 				arc_hdr_destroy(exists);
@@ -3754,22 +5563,22 @@
 					    (void *)hdr, (void *)exists);
 			} else {
 				/* Dedup */
-				ASSERT(hdr->b_datacnt == 1);
-				ASSERT(hdr->b_state == arc_anon);
+				ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
+				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
 				ASSERT(BP_GET_DEDUP(zio->io_bp));
 				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
 			}
 		}
-		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 		/* if it's not anon, we are doing a scrub */
-		if (!exists && hdr->b_state == arc_anon)
+		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
 			arc_access(hdr, hash_lock);
 		mutex_exit(hash_lock);
 	} else {
-		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 	}
 
-	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+	ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 	callback->awcb_done(zio, buf, callback->awcb_private);
 
 	kmem_free(callback, sizeof (arc_write_callback_t));
@@ -3776,52 +5585,78 @@
 }
 
 zio_t *
-arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
-    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
-    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
-    void *private, int priority, int zio_flags, const zbookmark_t *zb)
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+    boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready,
+    arc_done_func_t *children_ready, arc_done_func_t *physdone,
+    arc_done_func_t *done, void *private, zio_priority_t priority,
+    int zio_flags, const zbookmark_phys_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
 	zio_t *zio;
 
-	ASSERT(ready != NULL);
-	ASSERT(done != NULL);
+	ASSERT3P(ready, !=, NULL);
+	ASSERT3P(done, !=, NULL);
 	ASSERT(!HDR_IO_ERROR(hdr));
-	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
-	ASSERT(hdr->b_acb == NULL);
+	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+	ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
 	if (l2arc)
-		hdr->b_flags |= ARC_L2CACHE;
-	if (l2arc_compress)
-		hdr->b_flags |= ARC_L2COMPRESS;
+		arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
 	callback->awcb_ready = ready;
+	callback->awcb_children_ready = children_ready;
+	callback->awcb_physdone = physdone;
 	callback->awcb_done = done;
 	callback->awcb_private = private;
 	callback->awcb_buf = buf;
 
-	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
-	    arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
+	/*
+	 * The hdr's b_pdata is now stale; free it now. A new data block
+	 * will be allocated when the zio pipeline calls arc_write_ready().
+	 */
+	if (hdr->b_l1hdr.b_pdata != NULL) {
+		/*
+		 * If the buf is currently sharing the data block with
+		 * the hdr then we need to break that relationship here.
+		 * The hdr will remain with a NULL data pointer and the
+		 * buf will take sole ownership of the block.
+		 */
+		if (arc_buf_is_shared(buf)) {
+			ASSERT(ARC_BUF_LAST(buf));
+			arc_unshare_buf(hdr, buf);
+		} else {
+			arc_hdr_free_pdata(hdr);
+		}
+		VERIFY3P(buf->b_data, !=, NULL);
+		arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
+	}
+	ASSERT(!arc_buf_is_shared(buf));
+	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
 
+	zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp,
+	    arc_write_ready,
+	    (children_ready != NULL) ? arc_write_children_ready : NULL,
+	    arc_write_physdone, arc_write_done, callback,
+	    priority, zio_flags, zb);
+
 	return (zio);
 }
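
A hypothetical caller of the new arc_write() prototype above, showing where
the optional children_ready and physdone hooks slot in; the callback bodies,
priority, and flags are placeholders:

    static void
    my_write_ready(zio_t *zio, arc_buf_t *buf, void *private)
    {
            /* finish filling in the data before the write is issued */
    }

    static void
    my_write_done(zio_t *zio, arc_buf_t *buf, void *private)
    {
            /* drop references, record the resulting bp, etc. */
    }

    static zio_t *
    my_start_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
        arc_buf_t *buf, const zio_prop_t *zp, const zbookmark_phys_t *zb)
    {
            return (arc_write(pio, spa, txg, bp, buf, B_FALSE /* l2arc */, zp,
                my_write_ready, NULL /* children_ready */,
                NULL /* physdone */, my_write_done, NULL /* private */,
                ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
    }
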
 
 static int
-arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t txg)
 {
 #ifdef _KERNEL
-	uint64_t available_memory =
-	    ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
+	uint64_t available_memory = ptob(freemem);
 	static uint64_t page_load = 0;
 	static uint64_t last_txg = 0;
 
-#ifdef sun
-#if defined(__i386)
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
 	available_memory =
-	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
+	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
 #endif
-#endif	/* sun */
-	if (available_memory >= zfs_write_limit_max)
+
+	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
 		return (0);
 
 	if (txg > last_txg) {
@@ -3834,7 +5669,7 @@
 	 * continue to let page writes occur as quickly as possible.
 	 */
 	if (curproc == pageproc) {
-		if (page_load > available_memory / 4)
+		if (page_load > MAX(ptob(minfree), available_memory) / 4)
 			return (SET_ERROR(ERESTART));
 		/* Note: reserve is inflated, so we deflate */
 		page_load += reserve / 8;
@@ -3845,20 +5680,6 @@
 		return (SET_ERROR(EAGAIN));
 	}
 	page_load = 0;
-
-	if (arc_size > arc_c_min) {
-		uint64_t evictable_memory =
-		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
-		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
-		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
-		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
-		available_memory += MIN(evictable_memory, arc_size - arc_c_min);
-	}
-
-	if (inflight_data > available_memory / 4) {
-		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
-		return (SET_ERROR(ERESTART));
-	}
 #endif
 	return (0);
 }
@@ -3876,17 +5697,10 @@
 	int error;
 	uint64_t anon_size;
 
-#ifdef ZFS_DEBUG
-	/*
-	 * Once in a while, fail for no reason.  Everything should cope.
-	 */
-	if (spa_get_random(10000) == 0) {
-		dprintf("forcing random failure\n");
-		return (SET_ERROR(ERESTART));
+	if (reserve > arc_c/4 && !arc_no_grow) {
+		arc_c = MIN(arc_c_max, reserve * 4);
+		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
 	}
-#endif
-	if (reserve > arc_c/4 && !arc_no_grow)
-		arc_c = MIN(arc_c_max, reserve * 4);
 	if (reserve > arc_c)
 		return (SET_ERROR(ENOMEM));
 
@@ -3895,7 +5709,8 @@
 	 * network delays from blocking transactions that are ready to be
 	 * assigned to a txg.
 	 */
-	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
+	anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
+	    arc_loaned_bytes), 0);
 
 	/*
 	 * Writes will, almost always, require additional memory allocations
@@ -3902,7 +5717,8 @@
 	 * in order to compress/encrypt/etc the data.  We therefore need to
 	 * make sure that there is sufficient available memory for this.
 	 */
-	if (error = arc_memory_throttle(reserve, anon_size, txg))
+	error = arc_memory_throttle(reserve, txg);
+	if (error != 0)
 		return (error);
 
 	/*
@@ -3915,12 +5731,14 @@
 
 	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
 	    anon_size > arc_c / 4) {
+		uint64_t meta_esize =
+		    refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+		uint64_t data_esize =
+		    refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
 		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
 		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
-		    arc_tempreserve>>10,
-		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
-		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
-		    reserve>>10, arc_c>>10);
+		    arc_tempreserve >> 10, meta_esize >> 10,
+		    data_esize >> 10, reserve >> 10, arc_c >> 10);
 		return (SET_ERROR(ERESTART));
 	}
 	atomic_add_64(&arc_tempreserve, reserve);
@@ -3927,7 +5745,85 @@
 	return (0);
 }
 
-static kmutex_t arc_lowmem_lock;
+static void
+arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
+    kstat_named_t *evict_data, kstat_named_t *evict_metadata)
+{
+	size->value.ui64 = refcount_count(&state->arcs_size);
+	evict_data->value.ui64 =
+	    refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
+	evict_metadata->value.ui64 =
+	    refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
+}
+
+static int
+arc_kstat_update(kstat_t *ksp, int rw)
+{
+	arc_stats_t *as = ksp->ks_data;
+
+	if (rw == KSTAT_WRITE) {
+		return (EACCES);
+	} else {
+		arc_kstat_update_state(arc_anon,
+		    &as->arcstat_anon_size,
+		    &as->arcstat_anon_evictable_data,
+		    &as->arcstat_anon_evictable_metadata);
+		arc_kstat_update_state(arc_mru,
+		    &as->arcstat_mru_size,
+		    &as->arcstat_mru_evictable_data,
+		    &as->arcstat_mru_evictable_metadata);
+		arc_kstat_update_state(arc_mru_ghost,
+		    &as->arcstat_mru_ghost_size,
+		    &as->arcstat_mru_ghost_evictable_data,
+		    &as->arcstat_mru_ghost_evictable_metadata);
+		arc_kstat_update_state(arc_mfu,
+		    &as->arcstat_mfu_size,
+		    &as->arcstat_mfu_evictable_data,
+		    &as->arcstat_mfu_evictable_metadata);
+		arc_kstat_update_state(arc_mfu_ghost,
+		    &as->arcstat_mfu_ghost_size,
+		    &as->arcstat_mfu_ghost_evictable_data,
+		    &as->arcstat_mfu_ghost_evictable_metadata);
+	}
+
+	return (0);
+}
+
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the ARC eviction
+ * code is laid out; arc_evict_state() assumes ARC buffers are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+arc_state_multilist_index_func(multilist_t *ml, void *obj)
+{
+	arc_buf_hdr_t *hdr = obj;
+
+	/*
+	 * We rely on b_dva to generate evenly distributed index
+	 * numbers using buf_hash below. So, as an added precaution,
+	 * let's make sure we never add empty buffers to the arc lists.
+	 */
+	ASSERT(!HDR_EMPTY(hdr));
+
+	/*
+	 * The assumption here is that the hash value for a given
+	 * arc_buf_hdr_t will remain constant throughout its lifetime
+	 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
+	 * Thus, we don't need to store the header's sublist index
+	 * on insertion, as this index can be recalculated on removal.
+	 *
+	 * Also, the low order bits of the hash value are thought to be
+	 * distributed evenly. Otherwise, in the case that the multilist
+	 * has a power of two number of sublists, each sublists' usage
+	 * would not be evenly distributed.
+	 */
+	return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
+	    multilist_get_num_sublists(ml));
+}
+
 #ifdef _KERNEL
 static eventhandler_tag arc_event_lowmem = NULL;
 
@@ -3935,11 +5831,11 @@
 arc_lowmem(void *arg __unused, int howto __unused)
 {
 
-	/* Serialize access via arc_lowmem_lock. */
-	mutex_enter(&arc_lowmem_lock);
-	mutex_enter(&arc_reclaim_thr_lock);
-	needfree = 1;
-	cv_signal(&arc_reclaim_thr_cv);
+	mutex_enter(&arc_reclaim_lock);
+	/* XXX: Memory deficit should be passed as argument. */
+	needfree = btoc(arc_c >> arc_shrink_shift);
+	DTRACE_PROBE(arc__needfree);
+	cv_signal(&arc_reclaim_thread_cv);
 
 	/*
 	 * It is unsafe to block here in arbitrary threads, because we can come
@@ -3946,23 +5842,131 @@
 	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
 	 * with ARC reclaim thread.
 	 */
-	if (curproc == pageproc) {
-		while (needfree)
-			msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
-	}
-	mutex_exit(&arc_reclaim_thr_lock);
-	mutex_exit(&arc_lowmem_lock);
+	if (curproc == pageproc)
+		(void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
+	mutex_exit(&arc_reclaim_lock);
 }
 #endif
 
+static void
+arc_state_init(void)
+{
+	arc_anon = &ARC_anon;
+	arc_mru = &ARC_mru;
+	arc_mru_ghost = &ARC_mru_ghost;
+	arc_mfu = &ARC_mfu;
+	arc_mfu_ghost = &ARC_mfu_ghost;
+	arc_l2c_only = &ARC_l2c_only;
+
+	multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+
+	refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+	refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
+	refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
+	refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
+	refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
+	refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
+
+	refcount_create(&arc_anon->arcs_size);
+	refcount_create(&arc_mru->arcs_size);
+	refcount_create(&arc_mru_ghost->arcs_size);
+	refcount_create(&arc_mfu->arcs_size);
+	refcount_create(&arc_mfu_ghost->arcs_size);
+	refcount_create(&arc_l2c_only->arcs_size);
+}
+
+static void
+arc_state_fini(void)
+{
+	refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+	refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
+	refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
+	refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
+	refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
+	refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
+
+	refcount_destroy(&arc_anon->arcs_size);
+	refcount_destroy(&arc_mru->arcs_size);
+	refcount_destroy(&arc_mru_ghost->arcs_size);
+	refcount_destroy(&arc_mfu->arcs_size);
+	refcount_destroy(&arc_mfu_ghost->arcs_size);
+	refcount_destroy(&arc_l2c_only->arcs_size);
+
+	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
+	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
+	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
+	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
+	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
+	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
+	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
+	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
+}
+
+uint64_t
+arc_max_bytes(void)
+{
+	return (arc_c_max);
+}
+
 void
 arc_init(void)
 {
 	int i, prefetch_tunable_set = 0;
 
-	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
-	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Convert seconds to clock ticks */
 	arc_min_prefetch_lifespan = 1 * hz;
@@ -3970,7 +5974,7 @@
 	/* Start out with 1/8 of all memory */
 	arc_c = kmem_size() / 8;
 
-#ifdef sun
+#ifdef illumos
 #ifdef _KERNEL
 	/*
 	 * On architectures where the physical memory can be larger
@@ -3979,29 +5983,42 @@
 	 */
 	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
 #endif
-#endif	/* sun */
-	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
-	arc_c_min = MAX(arc_c / 4, 64<<18);
+#endif	/* illumos */
+	/* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
+	arc_c_min = MAX(arc_c / 4, arc_abs_min);
 	/* set max to 1/2 of all memory, or all but 1GB, whichever is more */
-	if (arc_c * 8 >= 1<<30)
-		arc_c_max = (arc_c * 8) - (1<<30);
+	if (arc_c * 8 >= 1 << 30)
+		arc_c_max = (arc_c * 8) - (1 << 30);
 	else
 		arc_c_max = arc_c_min;
 	arc_c_max = MAX(arc_c * 5, arc_c_max);
 
+	/*
+	 * In userland, there's only the memory pressure that we artificially
+	 * create (see arc_available_memory()).  Don't let arc_c get too
+	 * small, because it can cause transactions to be larger than
+	 * arc_c, causing arc_tempreserve_space() to fail.
+	 */
+#ifndef _KERNEL
+	arc_c_min = arc_c_max / 2;
+#endif
+
 #ifdef _KERNEL
 	/*
 	 * Allow the tunables to override our calculations if they are
-	 * reasonable (ie. over 16MB)
+	 * reasonable.
 	 */
-	if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
+	if (zfs_arc_max > arc_abs_min && zfs_arc_max < kmem_size()) {
 		arc_c_max = zfs_arc_max;
-	if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
+		arc_c_min = MIN(arc_c_min, arc_c_max);
+	}
+	if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max)
 		arc_c_min = zfs_arc_min;
 #endif
 
 	arc_c = arc_c_max;
 	arc_p = (arc_c >> 1);
+	arc_size = 0;
 
 	/* limit meta-data to 1/4 of the arc capacity */
 	arc_meta_limit = arc_c_max / 4;
@@ -4013,6 +6030,12 @@
 	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
 		arc_c_min = arc_meta_limit / 2;
 
+	if (zfs_arc_meta_min > 0) {
+		arc_meta_min = zfs_arc_meta_min;
+	} else {
+		arc_meta_min = arc_c_min / 2;
+	}
+
 	if (zfs_arc_grow_retry > 0)
 		arc_grow_retry = zfs_arc_grow_retry;
 
@@ -4019,9 +6042,18 @@
 	if (zfs_arc_shrink_shift > 0)
 		arc_shrink_shift = zfs_arc_shrink_shift;
 
+	/*
+	 * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
+	 */
+	if (arc_no_grow_shift >= arc_shrink_shift)
+		arc_no_grow_shift = arc_shrink_shift - 1;
+
 	if (zfs_arc_p_min_shift > 0)
 		arc_p_min_shift = zfs_arc_p_min_shift;
 
+	if (zfs_arc_num_sublists_per_state < 1)
+		zfs_arc_num_sublists_per_state = MAX(max_ncpus, 1);
+
 	/* if kmem_flags are set, lets try to use less memory */
 	if (kmem_debugging())
 		arc_c = arc_c / 2;
@@ -4031,48 +6063,10 @@
 	zfs_arc_min = arc_c_min;
 	zfs_arc_max = arc_c_max;
 
-	arc_anon = &ARC_anon;
-	arc_mru = &ARC_mru;
-	arc_mru_ghost = &ARC_mru_ghost;
-	arc_mfu = &ARC_mfu;
-	arc_mfu_ghost = &ARC_mfu_ghost;
-	arc_l2c_only = &ARC_l2c_only;
-	arc_size = 0;
-
-	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
-		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
-		    NULL, MUTEX_DEFAULT, NULL);
-		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
-		    NULL, MUTEX_DEFAULT, NULL);
-		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
-		    NULL, MUTEX_DEFAULT, NULL);
-		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
-		    NULL, MUTEX_DEFAULT, NULL);
-		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
-		    NULL, MUTEX_DEFAULT, NULL);
-		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
-		    NULL, MUTEX_DEFAULT, NULL);
-
-		list_create(&arc_mru->arcs_lists[i],
-		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-		list_create(&arc_mru_ghost->arcs_lists[i],
-		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-		list_create(&arc_mfu->arcs_lists[i],
-		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-		list_create(&arc_mfu_ghost->arcs_lists[i],
-		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-		list_create(&arc_mfu_ghost->arcs_lists[i],
-		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-		list_create(&arc_l2c_only->arcs_lists[i],
-		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	}
-
+	arc_state_init();
 	buf_init();
 
-	arc_thread_exit = 0;
-	arc_eviction_list = NULL;
-	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
-	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
+	arc_reclaim_thread_exit = B_FALSE;
 
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
@@ -4079,6 +6073,7 @@
 
 	if (arc_ksp != NULL) {
 		arc_ksp->ks_data = &arc_stats;
+		arc_ksp->ks_update = arc_kstat_update;
 		kstat_install(arc_ksp);
 	}
 
@@ -4090,14 +6085,23 @@
 	    EVENTHANDLER_PRI_FIRST);
 #endif
 
-	arc_dead = FALSE;
+	arc_dead = B_FALSE;
 	arc_warm = B_FALSE;
 
-	if (zfs_write_limit_max == 0)
-		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
-	else
-		zfs_write_limit_shift = 0;
-	mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
+	/*
+	 * Calculate maximum amount of dirty data per pool.
+	 *
+	 * If it has been set by /etc/system, take that.
+	 * Otherwise, use a percentage of physical memory defined by
+	 * zfs_dirty_data_max_percent (default 10%) with a cap at
+	 * zfs_dirty_data_max_max (default 4GB).
+	 */
+	if (zfs_dirty_data_max == 0) {
+		zfs_dirty_data_max = ptob(physmem) *
+		    zfs_dirty_data_max_percent / 100;
+		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+		    zfs_dirty_data_max_max);
+	}
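
A worked instance of the cap above, as a minimal sketch assuming a hypothetical 64GB machine and the default tunables the comment names (10% and 4GB); the variable names are illustrative:

	#include <stdint.h>
	#include <stdio.h>

	#define	MIN(a, b)	((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		uint64_t physmem_bytes = 64ULL << 30;	/* hypothetical 64GB box */
		uint64_t max_percent = 10;	/* zfs_dirty_data_max_percent */
		uint64_t max_max = 4ULL << 30;	/* zfs_dirty_data_max_max */
		uint64_t dirty_data_max;

		dirty_data_max = physmem_bytes * max_percent / 100;	/* 6.4GB */
		dirty_data_max = MIN(dirty_data_max, max_max);		/* capped: 4GB */
		printf("%ju\n", (uintmax_t)dirty_data_max);
		return (0);
	}
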
 
 #ifdef _KERNEL
 	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
@@ -4139,50 +6143,37 @@
 void
 arc_fini(void)
 {
-	int i;
+	mutex_enter(&arc_reclaim_lock);
+	arc_reclaim_thread_exit = B_TRUE;
+	/*
+	 * The reclaim thread will set arc_reclaim_thread_exit back to
+	 * B_FALSE when it is finished exiting; we're waiting for that.
+	 */
+	while (arc_reclaim_thread_exit) {
+		cv_signal(&arc_reclaim_thread_cv);
+		cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
+	}
+	mutex_exit(&arc_reclaim_lock);
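
The exit handshake above is a general stop-a-worker pattern: raise a flag, then signal and wait on a single condition variable until the worker clears the flag to acknowledge. A standalone pthread sketch of the same idea (all names illustrative, not the kernel primitives):

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t reclaim_cv = PTHREAD_COND_INITIALIZER;
	static bool reclaim_thread_exit;

	void
	stop_reclaim_thread(void)
	{
		pthread_mutex_lock(&reclaim_lock);
		reclaim_thread_exit = true;
		while (reclaim_thread_exit) {
			pthread_cond_signal(&reclaim_cv);	/* wake the worker */
			pthread_cond_wait(&reclaim_cv, &reclaim_lock);
		}
		pthread_mutex_unlock(&reclaim_lock);
	}

	/* Run by the worker just before it exits. */
	void
	reclaim_thread_ack(void)
	{
		pthread_mutex_lock(&reclaim_lock);
		reclaim_thread_exit = false;	/* the ack the caller waits on */
		pthread_cond_signal(&reclaim_cv);
		pthread_mutex_unlock(&reclaim_lock);
	}
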
 
-	mutex_enter(&arc_reclaim_thr_lock);
-	arc_thread_exit = 1;
-	cv_signal(&arc_reclaim_thr_cv);
-	while (arc_thread_exit != 0)
-		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
-	mutex_exit(&arc_reclaim_thr_lock);
+	/* Use B_TRUE to ensure *all* buffers are evicted */
+	arc_flush(NULL, B_TRUE);
 
-	arc_flush(NULL);
+	arc_dead = B_TRUE;
 
-	arc_dead = TRUE;
-
 	if (arc_ksp != NULL) {
 		kstat_delete(arc_ksp);
 		arc_ksp = NULL;
 	}
 
-	mutex_destroy(&arc_eviction_mtx);
-	mutex_destroy(&arc_reclaim_thr_lock);
-	cv_destroy(&arc_reclaim_thr_cv);
+	mutex_destroy(&arc_reclaim_lock);
+	cv_destroy(&arc_reclaim_thread_cv);
+	cv_destroy(&arc_reclaim_waiters_cv);
 
-	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
-		list_destroy(&arc_mru->arcs_lists[i]);
-		list_destroy(&arc_mru_ghost->arcs_lists[i]);
-		list_destroy(&arc_mfu->arcs_lists[i]);
-		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
-		list_destroy(&arc_l2c_only->arcs_lists[i]);
-
-		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
-		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
-		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
-		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
-		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
-		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
-	}
-
-	mutex_destroy(&zfs_write_limit_lock);
-
+	arc_state_fini();
 	buf_fini();
 
-	ASSERT(arc_loaned_bytes == 0);
+	ASSERT0(arc_loaned_bytes);
 
-	mutex_destroy(&arc_lowmem_lock);
 #ifdef _KERNEL
 	if (arc_event_lowmem != NULL)
 		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
@@ -4335,7 +6326,7 @@
  */
 
 static boolean_t
-l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
+l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
 {
 	/*
 	 * A buffer is *not* eligible for the L2ARC if it:
@@ -4344,19 +6335,19 @@
 	 * 3. has an I/O in progress (it may be an incomplete read).
 	 * 4. is flagged not eligible (zfs property).
 	 */
-	if (ab->b_spa != spa_guid) {
+	if (hdr->b_spa != spa_guid) {
 		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
 		return (B_FALSE);
 	}
-	if (ab->b_l2hdr != NULL) {
+	if (HDR_HAS_L2HDR(hdr)) {
 		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
 		return (B_FALSE);
 	}
-	if (HDR_IO_IN_PROGRESS(ab)) {
+	if (HDR_IO_IN_PROGRESS(hdr)) {
 		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
 		return (B_FALSE);
 	}
-	if (!HDR_L2CACHE(ab)) {
+	if (!HDR_L2CACHE(hdr)) {
 		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
 		return (B_FALSE);
 	}
@@ -4410,20 +6401,6 @@
 	return (next);
 }
 
-static void
-l2arc_hdr_stat_add(void)
-{
-	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
-	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
-}
-
-static void
-l2arc_hdr_stat_remove(void)
-{
-	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
-	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
-}
-
 /*
  * Cycle through L2ARC devices.  This is how L2ARC load balances.
  * If a device is returned, this also returns holding the spa config lock.
@@ -4499,9 +6476,13 @@
 
 	for (df = list_tail(buflist); df; df = df_prev) {
 		df_prev = list_prev(buflist, df);
-		ASSERT(df->l2df_data != NULL);
-		ASSERT(df->l2df_func != NULL);
-		df->l2df_func(df->l2df_data, df->l2df_size);
+		ASSERT3P(df->l2df_data, !=, NULL);
+		if (df->l2df_type == ARC_BUFC_METADATA) {
+			zio_buf_free(df->l2df_data, df->l2df_size);
+		} else {
+			ASSERT(df->l2df_type == ARC_BUFC_DATA);
+			zio_data_buf_free(df->l2df_data, df->l2df_size);
+		}
 		list_remove(buflist, df);
 		kmem_free(df, sizeof (l2arc_data_free_t));
 	}
@@ -4519,18 +6500,18 @@
 	l2arc_write_callback_t *cb;
 	l2arc_dev_t *dev;
 	list_t *buflist;
-	arc_buf_hdr_t *head, *ab, *ab_prev;
-	l2arc_buf_hdr_t *abl2;
+	arc_buf_hdr_t *head, *hdr, *hdr_prev;
 	kmutex_t *hash_lock;
+	int64_t bytes_dropped = 0;
 
 	cb = zio->io_private;
-	ASSERT(cb != NULL);
+	ASSERT3P(cb, !=, NULL);
 	dev = cb->l2wcb_dev;
-	ASSERT(dev != NULL);
+	ASSERT3P(dev, !=, NULL);
 	head = cb->l2wcb_head;
-	ASSERT(head != NULL);
-	buflist = dev->l2ad_buflist;
-	ASSERT(buflist != NULL);
+	ASSERT3P(head, !=, NULL);
+	buflist = &dev->l2ad_buflist;
+	ASSERT3P(buflist, !=, NULL);
 	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
 	    l2arc_write_callback_t *, cb);
 
@@ -4537,50 +6518,78 @@
 	if (zio->io_error != 0)
 		ARCSTAT_BUMP(arcstat_l2_writes_error);
 
-	mutex_enter(&l2arc_buflist_mtx);
-
 	/*
 	 * All writes completed, or an error was hit.
 	 */
-	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
-		ab_prev = list_prev(buflist, ab);
+top:
+	mutex_enter(&dev->l2ad_mtx);
+	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
+		hdr_prev = list_prev(buflist, hdr);
 
-		hash_lock = HDR_LOCK(ab);
+		hash_lock = HDR_LOCK(hdr);
+
+		/*
+		 * We cannot use mutex_enter or else we can deadlock
+		 * with l2arc_write_buffers (due to swapping the order
+		 * in which the hash lock and l2ad_mtx are taken).
+		 */
 		if (!mutex_tryenter(hash_lock)) {
 			/*
-			 * This buffer misses out.  It may be in a stage
-			 * of eviction.  Its ARC_L2_WRITING flag will be
-			 * left set, denying reads to this buffer.
+			 * Missed the hash lock. We must retry so we
+			 * don't leave the ARC_FLAG_L2_WRITING bit set.
 			 */
-			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
-			continue;
+			ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
+
+			/*
+			 * We don't want to rescan the headers we've
+			 * already marked as having been written out, so
+			 * we reinsert the head node so we can pick up
+			 * where we left off.
+			 */
+			list_remove(buflist, head);
+			list_insert_after(buflist, hdr, head);
+
+			mutex_exit(&dev->l2ad_mtx);
+
+			/*
+			 * We wait for the hash lock to become available
+			 * to try to prevent busy waiting, and increase
+			 * the chance we'll be able to acquire the lock
+			 * the next time around.
+			 */
+			mutex_enter(hash_lock);
+			mutex_exit(hash_lock);
+			goto top;
 		}
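
The tryenter dance above preserves the lock order used by l2arc_write_buffers (hash lock first, then l2ad_mtx): while already holding l2ad_mtx we may only *try* the hash lock, and on failure we drop l2ad_mtx and block on the hash lock once, purely to avoid spinning, before retrying. A pthread sketch of the pattern with illustrative names:

	#include <pthread.h>

	static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER; /* l2ad_mtx role */

	static void
	work_under_both(void)
	{
		for (;;) {
			pthread_mutex_lock(&list_lock);
			if (pthread_mutex_trylock(&hash_lock) == 0)
				break;	/* both held, no order reversal */
			pthread_mutex_unlock(&list_lock);
			/* Block until the hash lock is free, then retry. */
			pthread_mutex_lock(&hash_lock);
			pthread_mutex_unlock(&hash_lock);
		}
		/* ... work under both locks ... */
		pthread_mutex_unlock(&hash_lock);
		pthread_mutex_unlock(&list_lock);
	}
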
 
-		abl2 = ab->b_l2hdr;
-
 		/*
-		 * Release the temporary compressed buffer as soon as possible.
+		 * We could not have been moved into the arc_l2c_only
+		 * state while in-flight due to our ARC_FLAG_L2_WRITING
+		 * bit being set. Let's just ensure that's being enforced.
 		 */
-		if (abl2->b_compress != ZIO_COMPRESS_OFF)
-			l2arc_release_cdata_buf(ab);
+		ASSERT(HDR_HAS_L1HDR(hdr));
 
 		if (zio->io_error != 0) {
 			/*
 			 * Error - drop L2ARC entry.
 			 */
-			list_remove(buflist, ab);
-			ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
-			ab->b_l2hdr = NULL;
-			trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr,
-			    ab->b_size, 0);
-			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
-			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
+			list_remove(buflist, hdr);
+			l2arc_trim(hdr);
+			arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
+
+			ARCSTAT_INCR(arcstat_l2_asize, -arc_hdr_size(hdr));
+			ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr));
+
+			bytes_dropped += arc_hdr_size(hdr);
+			(void) refcount_remove_many(&dev->l2ad_alloc,
+			    arc_hdr_size(hdr), hdr);
 		}
 
 		/*
-		 * Allow ARC to begin reads to this L2ARC entry.
+		 * Allow ARC to begin reads and ghost list evictions to
+		 * this L2ARC entry.
 		 */
-		ab->b_flags &= ~ARC_L2_WRITING;
+		arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
 
 		mutex_exit(hash_lock);
 	}
@@ -4587,9 +6596,12 @@
 
 	atomic_inc_64(&l2arc_writes_done);
 	list_remove(buflist, head);
-	kmem_cache_free(hdr_cache, head);
-	mutex_exit(&l2arc_buflist_mtx);
+	ASSERT(!HDR_HAS_L1HDR(head));
+	kmem_cache_free(hdr_l2only_cache, head);
+	mutex_exit(&dev->l2ad_mtx);
 
+	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
+
 	l2arc_do_free_on_write();
 
 	kmem_free(cb, sizeof (l2arc_write_callback_t));
@@ -4604,41 +6616,63 @@
 {
 	l2arc_read_callback_t *cb;
 	arc_buf_hdr_t *hdr;
-	arc_buf_t *buf;
 	kmutex_t *hash_lock;
-	int equal;
+	boolean_t valid_cksum;
 
-	ASSERT(zio->io_vd != NULL);
+	ASSERT3P(zio->io_vd, !=, NULL);
 	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
 
 	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
 
 	cb = zio->io_private;
-	ASSERT(cb != NULL);
-	buf = cb->l2rcb_buf;
-	ASSERT(buf != NULL);
+	ASSERT3P(cb, !=, NULL);
+	hdr = cb->l2rcb_hdr;
+	ASSERT3P(hdr, !=, NULL);
 
-	hash_lock = HDR_LOCK(buf->b_hdr);
+	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
-	hdr = buf->b_hdr;
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
 	/*
-	 * If the buffer was compressed, decompress it first.
+	 * If the data was read into a temporary buffer,
+	 * move it and free the buffer.
 	 */
-	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
-		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
-	ASSERT(zio->io_data != NULL);
+	if (cb->l2rcb_data != NULL) {
+		ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
+		if (zio->io_error == 0) {
+			bcopy(cb->l2rcb_data, hdr->b_l1hdr.b_pdata,
+			    arc_hdr_size(hdr));
+		}
 
+		/*
+		 * The following must be done regardless of whether
+		 * there was an error:
+		 * - free the temporary buffer
+		 * - point zio to the real ARC buffer
+		 * - set zio size accordingly
+		 * These are required because the zio is either re-used
+		 * to re-issue an I/O for the block in the error case, or
+		 * passed to arc_read_done(), which needs real data.
+		 */
+		zio_data_buf_free(cb->l2rcb_data, zio->io_size);
+		zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
+		zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_pdata;
+	}
+
+	ASSERT3P(zio->io_data, !=, NULL);
+
 	/*
 	 * Check this survived the L2ARC journey.
 	 */
-	equal = arc_cksum_equal(buf);
-	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
+	ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata);
+	zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
+	zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
+
+	valid_cksum = arc_cksum_is_equal(hdr, zio);
+	if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
 		mutex_exit(hash_lock);
-		zio->io_private = buf;
-		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
-		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
+		zio->io_private = hdr;
 		arc_read_done(zio);
 	} else {
 		mutex_exit(hash_lock);
@@ -4651,7 +6685,7 @@
 		} else {
 			zio->io_error = SET_ERROR(EIO);
 		}
-		if (!equal)
+		if (!valid_cksum)
 			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
 
 		/*
@@ -4664,9 +6698,10 @@
 
 			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
 
-			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
-			    buf->b_data, zio->io_size, arc_read_done, buf,
-			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
+			zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
+			    hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done,
+			    hdr, zio->io_priority, cb->l2rcb_flags,
+			    &cb->l2rcb_zb));
 		}
 	}
 
@@ -4683,36 +6718,37 @@
  * the data lists.  This function returns a locked list, and also returns
  * the lock pointer.
  */
-static list_t *
-l2arc_list_locked(int list_num, kmutex_t **lock)
+static multilist_sublist_t *
+l2arc_sublist_lock(int list_num)
 {
-	list_t *list = NULL;
-	int idx;
+	multilist_t *ml = NULL;
+	unsigned int idx;
 
-	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
+	ASSERT(list_num >= 0 && list_num <= 3);
 
-	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
-		idx = list_num;
-		list = &arc_mfu->arcs_lists[idx];
-		*lock = ARCS_LOCK(arc_mfu, idx);
-	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
-		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
-		list = &arc_mru->arcs_lists[idx];
-		*lock = ARCS_LOCK(arc_mru, idx);
-	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
-		ARC_BUFC_NUMDATALISTS)) {
-		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
-		list = &arc_mfu->arcs_lists[idx];
-		*lock = ARCS_LOCK(arc_mfu, idx);
-	} else {
-		idx = list_num - ARC_BUFC_NUMLISTS;
-		list = &arc_mru->arcs_lists[idx];
-		*lock = ARCS_LOCK(arc_mru, idx);
+	switch (list_num) {
+	case 0:
+		ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
+		break;
+	case 1:
+		ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
+		break;
+	case 2:
+		ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
+		break;
+	case 3:
+		ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
+		break;
 	}
 
-	ASSERT(!(MUTEX_HELD(*lock)));
-	mutex_enter(*lock);
-	return (list);
+	/*
+	 * Return a randomly-selected sublist. This is acceptable
+	 * because the caller feeds only a little bit of data for each
+	 * call (8MB). Subsequent calls will result in different
+	 * sublists being selected.
+	 */
+	idx = multilist_get_random_index(ml);
+	return (multilist_sublist_lock(ml, idx));
 }
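
A sketch of the random-selection idea, assuming a hypothetical array of per-sublist locks; uniform random picks spread lock contention and, across repeated 8MB feeds, coverage over all sublists. Names and the sublist count are illustrative:

	#include <pthread.h>
	#include <stdlib.h>

	#define	NUM_SUBLISTS	8	/* assumed sublist count */

	static pthread_mutex_t sublist_lock[NUM_SUBLISTS];

	static void
	sublists_init(void)
	{
		for (int i = 0; i < NUM_SUBLISTS; i++)
			pthread_mutex_init(&sublist_lock[i], NULL);
	}

	static unsigned int
	pick_and_lock_sublist(void)
	{
		unsigned int idx = (unsigned int)rand() % NUM_SUBLISTS;

		pthread_mutex_lock(&sublist_lock[idx]);
		return (idx);	/* caller later unlocks sublist_lock[idx] */
	}
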
 
 /*
@@ -4725,16 +6761,12 @@
 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
 {
 	list_t *buflist;
-	l2arc_buf_hdr_t *abl2;
-	arc_buf_hdr_t *ab, *ab_prev;
+	arc_buf_hdr_t *hdr, *hdr_prev;
 	kmutex_t *hash_lock;
 	uint64_t taddr;
 
-	buflist = dev->l2ad_buflist;
+	buflist = &dev->l2ad_buflist;
 
-	if (buflist == NULL)
-		return;
-
 	if (!all && dev->l2ad_first) {
 		/*
 		 * This is the first sweep through the device.  There is
@@ -4756,35 +6788,41 @@
 	    uint64_t, taddr, boolean_t, all);
 
 top:
-	mutex_enter(&l2arc_buflist_mtx);
-	for (ab = list_tail(buflist); ab; ab = ab_prev) {
-		ab_prev = list_prev(buflist, ab);
+	mutex_enter(&dev->l2ad_mtx);
+	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
+		hdr_prev = list_prev(buflist, hdr);
 
-		hash_lock = HDR_LOCK(ab);
+		hash_lock = HDR_LOCK(hdr);
+
+		/*
+		 * We cannot use mutex_enter or else we can deadlock
+		 * with l2arc_write_buffers (due to swapping the order
+		 * in which the hash lock and l2ad_mtx are taken).
+		 */
 		if (!mutex_tryenter(hash_lock)) {
 			/*
 			 * Missed the hash lock.  Retry.
 			 */
 			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
-			mutex_exit(&l2arc_buflist_mtx);
+			mutex_exit(&dev->l2ad_mtx);
 			mutex_enter(hash_lock);
 			mutex_exit(hash_lock);
 			goto top;
 		}
 
-		if (HDR_L2_WRITE_HEAD(ab)) {
+		if (HDR_L2_WRITE_HEAD(hdr)) {
 			/*
 			 * We hit a write head node.  Leave it for
 			 * l2arc_write_done().
 			 */
-			list_remove(buflist, ab);
+			list_remove(buflist, hdr);
 			mutex_exit(hash_lock);
 			continue;
 		}
 
-		if (!all && ab->b_l2hdr != NULL &&
-		    (ab->b_l2hdr->b_daddr > taddr ||
-		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
+		if (!all && HDR_HAS_L2HDR(hdr) &&
+		    (hdr->b_l2hdr.b_daddr >= taddr ||
+		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
 			/*
 			 * We've evicted to the target address,
 			 * or the end of the device.
@@ -4793,64 +6831,43 @@
 			break;
 		}
 
-		if (HDR_FREE_IN_PROGRESS(ab)) {
+		ASSERT(HDR_HAS_L2HDR(hdr));
+		if (!HDR_HAS_L1HDR(hdr)) {
+			ASSERT(!HDR_L2_READING(hdr));
 			/*
-			 * Already on the path to destruction.
-			 */
-			mutex_exit(hash_lock);
-			continue;
-		}
-
-		if (ab->b_state == arc_l2c_only) {
-			ASSERT(!HDR_L2_READING(ab));
-			/*
 			 * This doesn't exist in the ARC.  Destroy.
 			 * arc_hdr_destroy() will call list_remove()
 			 * and decrement arcstat_l2_size.
 			 */
-			arc_change_state(arc_anon, ab, hash_lock);
-			arc_hdr_destroy(ab);
+			arc_change_state(arc_anon, hdr, hash_lock);
+			arc_hdr_destroy(hdr);
 		} else {
+			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
+			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
 			/*
 			 * Invalidate issued or about to be issued
 			 * reads, since we may be about to write
 			 * over this location.
 			 */
-			if (HDR_L2_READING(ab)) {
+			if (HDR_L2_READING(hdr)) {
 				ARCSTAT_BUMP(arcstat_l2_evict_reading);
-				ab->b_flags |= ARC_L2_EVICTED;
+				arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
 			}
 
-			/*
-			 * Tell ARC this no longer exists in L2ARC.
-			 */
-			if (ab->b_l2hdr != NULL) {
-				abl2 = ab->b_l2hdr;
-				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
-				ab->b_l2hdr = NULL;
-				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
-				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
-			}
-			list_remove(buflist, ab);
+			/* Ensure this header has finished being written */
+			ASSERT(!HDR_L2_WRITING(hdr));
 
-			/*
-			 * This may have been leftover after a
-			 * failed write.
-			 */
-			ab->b_flags &= ~ARC_L2_WRITING;
+			arc_hdr_l2hdr_destroy(hdr);
 		}
 		mutex_exit(hash_lock);
 	}
-	mutex_exit(&l2arc_buflist_mtx);
-
-	vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
-	dev->l2ad_evict = taddr;
+	mutex_exit(&dev->l2ad_mtx);
 }
 
 /*
  * Find and write ARC buffers to the L2ARC device.
  *
- * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
+ * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
  * for reading until they have completed writing.
  * The headroom_boost is an in-out parameter used to maintain headroom boost
  * state between calls to this function.
@@ -4859,48 +6876,32 @@
  * the delta by which the device hand has changed due to alignment).
  */
 static uint64_t
-l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
-    boolean_t *headroom_boost)
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 {
-	arc_buf_hdr_t *ab, *ab_prev, *head;
-	list_t *list;
-	uint64_t write_asize, write_psize, write_sz, headroom,
-	    buf_compress_minsz;
-	void *buf_data;
-	kmutex_t *list_lock;
+	arc_buf_hdr_t *hdr, *hdr_prev, *head;
+	uint64_t write_asize, write_psize, write_sz, headroom;
 	boolean_t full;
 	l2arc_write_callback_t *cb;
 	zio_t *pio, *wzio;
 	uint64_t guid = spa_load_guid(spa);
-	const boolean_t do_headroom_boost = *headroom_boost;
 	int try;
 
-	ASSERT(dev->l2ad_vdev != NULL);
+	ASSERT3P(dev->l2ad_vdev, !=, NULL);
 
-	/* Lower the flag now, we might want to raise it again later. */
-	*headroom_boost = B_FALSE;
-
 	pio = NULL;
 	write_sz = write_asize = write_psize = 0;
 	full = B_FALSE;
-	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
-	head->b_flags |= ARC_L2_WRITE_HEAD;
+	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
+	arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
 
 	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
 	/*
-	 * We will want to try to compress buffers that are at least 2x the
-	 * device sector size.
-	 */
-	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
-
-	/*
 	 * Copy buffers for L2ARC writing.
 	 */
-	mutex_enter(&l2arc_buflist_mtx);
-	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
+	for (try = 0; try <= 3; try++) {
+		multilist_sublist_t *mls = l2arc_sublist_lock(try);
 		uint64_t passed_sz = 0;
 
-		list = l2arc_list_locked(try, &list_lock);
 		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
 
 		/*
@@ -4910,28 +6911,27 @@
 		 * head of the ARC lists rather than the tail.
 		 */
 		if (arc_warm == B_FALSE)
-			ab = list_head(list);
+			hdr = multilist_sublist_head(mls);
 		else
-			ab = list_tail(list);
-		if (ab == NULL)
+			hdr = multilist_sublist_tail(mls);
+		if (hdr == NULL)
 			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
 
 		headroom = target_sz * l2arc_headroom;
-		if (do_headroom_boost)
+		if (zfs_compressed_arc_enabled)
 			headroom = (headroom * l2arc_headroom_boost) / 100;
 
-		for (; ab; ab = ab_prev) {
-			l2arc_buf_hdr_t *l2hdr;
+		for (; hdr; hdr = hdr_prev) {
 			kmutex_t *hash_lock;
-			uint64_t buf_sz;
 
 			if (arc_warm == B_FALSE)
-				ab_prev = list_next(list, ab);
+				hdr_prev = multilist_sublist_next(mls, hdr);
 			else
-				ab_prev = list_prev(list, ab);
-			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
+				hdr_prev = multilist_sublist_prev(mls, hdr);
+			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned,
+			    HDR_GET_LSIZE(hdr));
 
-			hash_lock = HDR_LOCK(ab);
+			hash_lock = HDR_LOCK(hdr);
 			if (!mutex_tryenter(hash_lock)) {
 				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
 				/*
@@ -4940,7 +6940,7 @@
 				continue;
 			}
 
-			passed_sz += ab->b_size;
+			passed_sz += HDR_GET_LSIZE(hdr);
 			if (passed_sz > headroom) {
 				/*
 				 * Searched too far.
@@ -4950,12 +6950,27 @@
 				break;
 			}
 
-			if (!l2arc_write_eligible(guid, ab)) {
+			if (!l2arc_write_eligible(guid, hdr)) {
 				mutex_exit(hash_lock);
 				continue;
 			}
 
-			if ((write_sz + ab->b_size) > target_sz) {
+			/*
+			 * We rely on the L1 portion of the header below, so
+			 * it's invalid for this header to have been evicted out
+			 * of the ghost cache, prior to being written out. The
+			 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+			 */
+			ASSERT(HDR_HAS_L1HDR(hdr));
+
+			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
+			ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+			ASSERT3U(arc_hdr_size(hdr), >, 0);
+			uint64_t size = arc_hdr_size(hdr);
+			uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
+			    size);
+
+			if ((write_psize + asize) > target_sz) {
 				full = B_TRUE;
 				mutex_exit(hash_lock);
 				ARCSTAT_BUMP(arcstat_l2_write_full);
@@ -4968,7 +6983,9 @@
 				 * l2arc_write_done() can find where the
 				 * write buffers begin without searching.
 				 */
-				list_insert_head(dev->l2ad_buflist, head);
+				mutex_enter(&dev->l2ad_mtx);
+				list_insert_head(&dev->l2ad_buflist, head);
+				mutex_exit(&dev->l2ad_mtx);
 
 				cb = kmem_alloc(
 				    sizeof (l2arc_write_callback_t), KM_SLEEP);
@@ -4979,43 +6996,63 @@
 				ARCSTAT_BUMP(arcstat_l2_write_pios);
 			}
 
-			/*
-			 * Create and add a new L2ARC header.
-			 */
-			l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
-			l2hdr->b_dev = dev;
-			ab->b_flags |= ARC_L2_WRITING;
+			hdr->b_l2hdr.b_dev = dev;
+			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
+			arc_hdr_set_flags(hdr,
+			    ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);
 
+			mutex_enter(&dev->l2ad_mtx);
+			list_insert_head(&dev->l2ad_buflist, hdr);
+			mutex_exit(&dev->l2ad_mtx);
+
+			(void) refcount_add_many(&dev->l2ad_alloc, size, hdr);
+
 			/*
-			 * Temporarily stash the data buffer in b_tmp_cdata.
-			 * The subsequent write step will pick it up from
-			 * there. This is because can't access ab->b_buf
-			 * without holding the hash_lock, which we in turn
-			 * can't access without holding the ARC list locks
-			 * (which we want to avoid during compression/writing).
+			 * Normally the L2ARC can use the hdr's data, but if
+			 * we're sharing data between the hdr and one of its
+			 * bufs, L2ARC needs its own copy of the data so that
+			 * the ZIO below can't race with the buf consumer. To
+			 * ensure that this copy will be available for the
+			 * lifetime of the ZIO and be cleaned up afterwards, we
+			 * add it to the l2arc_free_on_write queue.
 			 */
-			l2hdr->b_compress = ZIO_COMPRESS_OFF;
-			l2hdr->b_asize = ab->b_size;
-			l2hdr->b_tmp_cdata = ab->b_buf->b_data;
+			void *to_write;
+			if (!HDR_SHARED_DATA(hdr) && size == asize) {
+				to_write = hdr->b_l1hdr.b_pdata;
+			} else {
+				arc_buf_contents_t type = arc_buf_type(hdr);
+				if (type == ARC_BUFC_METADATA) {
+					to_write = zio_buf_alloc(asize);
+				} else {
+					ASSERT3U(type, ==, ARC_BUFC_DATA);
+					to_write = zio_data_buf_alloc(asize);
+				}
 
-			buf_sz = ab->b_size;
-			ab->b_l2hdr = l2hdr;
+				bcopy(hdr->b_l1hdr.b_pdata, to_write, size);
+				if (asize != size)
+					bzero(to_write + size, asize - size);
+				l2arc_free_data_on_write(to_write, asize, type);
+			}
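
The else-branch above rounds the payload up to the device allocation size and zero-fills the tail, so the trailing bytes that reach disk are deterministic. A userland sketch of that step, assuming vdev_psize_to_asize() reduces to a power-of-two round-up at the device's ashift (true for a simple leaf vdev); the function name is illustrative:

	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>

	static void *
	copy_padded(const void *data, uint64_t size, int ashift)
	{
		uint64_t align = 1ULL << ashift;	/* e.g. 4K for ashift 12 */
		uint64_t asize = (size + align - 1) & ~(align - 1);
		char *to_write = malloc(asize);

		if (to_write == NULL)
			return (NULL);
		memcpy(to_write, data, size);
		if (asize != size)
			memset(to_write + size, 0, asize - size); /* zero the tail */
		return (to_write);
	}
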
+			wzio = zio_write_phys(pio, dev->l2ad_vdev,
+			    hdr->b_l2hdr.b_daddr, asize, to_write,
+			    ZIO_CHECKSUM_OFF, NULL, hdr,
+			    ZIO_PRIORITY_ASYNC_WRITE,
+			    ZIO_FLAG_CANFAIL, B_FALSE);
 
-			list_insert_head(dev->l2ad_buflist, ab);
+			write_sz += HDR_GET_LSIZE(hdr);
+			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+			    zio_t *, wzio);
 
-			/*
-			 * Compute and store the buffer cksum before
-			 * writing.  On debug the cksum is verified first.
-			 */
-			arc_cksum_verify(ab->b_buf);
-			arc_cksum_compute(ab->b_buf, B_TRUE);
+			write_asize += size;
+			write_psize += asize;
+			dev->l2ad_hand += asize;
 
 			mutex_exit(hash_lock);
 
-			write_sz += buf_sz;
+			(void) zio_nowait(wzio);
 		}
 
-		mutex_exit(list_lock);
+		multilist_sublist_unlock(mls);
 
 		if (full == B_TRUE)
 			break;
@@ -5024,79 +7061,17 @@
 	/* No buffers selected for writing? */
 	if (pio == NULL) {
 		ASSERT0(write_sz);
-		mutex_exit(&l2arc_buflist_mtx);
-		kmem_cache_free(hdr_cache, head);
+		ASSERT(!HDR_HAS_L1HDR(head));
+		kmem_cache_free(hdr_l2only_cache, head);
 		return (0);
 	}
 
-	/*
-	 * Now start writing the buffers. We're starting at the write head
-	 * and work backwards, retracing the course of the buffer selector
-	 * loop above.
-	 */
-	for (ab = list_prev(dev->l2ad_buflist, head); ab;
-	    ab = list_prev(dev->l2ad_buflist, ab)) {
-		l2arc_buf_hdr_t *l2hdr;
-		uint64_t buf_sz;
-
-		/*
-		 * We shouldn't need to lock the buffer here, since we flagged
-		 * it as ARC_L2_WRITING in the previous step, but we must take
-		 * care to only access its L2 cache parameters. In particular,
-		 * ab->b_buf may be invalid by now due to ARC eviction.
-		 */
-		l2hdr = ab->b_l2hdr;
-		l2hdr->b_daddr = dev->l2ad_hand;
-
-		if ((ab->b_flags & ARC_L2COMPRESS) &&
-		    l2hdr->b_asize >= buf_compress_minsz) {
-			if (l2arc_compress_buf(l2hdr)) {
-				/*
-				 * If compression succeeded, enable headroom
-				 * boost on the next scan cycle.
-				 */
-				*headroom_boost = B_TRUE;
-			}
-		}
-
-		/*
-		 * Pick up the buffer data we had previously stashed away
-		 * (and now potentially also compressed).
-		 */
-		buf_data = l2hdr->b_tmp_cdata;
-		buf_sz = l2hdr->b_asize;
-
-		/* Compression may have squashed the buffer to zero length. */
-		if (buf_sz != 0) {
-			uint64_t buf_p_sz;
-
-			wzio = zio_write_phys(pio, dev->l2ad_vdev,
-			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
-			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
-			    ZIO_FLAG_CANFAIL, B_FALSE);
-
-			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
-			    zio_t *, wzio);
-			(void) zio_nowait(wzio);
-
-			write_asize += buf_sz;
-			/*
-			 * Keep the clock hand suitably device-aligned.
-			 */
-			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
-			write_psize += buf_p_sz;
-			dev->l2ad_hand += buf_p_sz;
-		}
-	}
-
-	mutex_exit(&l2arc_buflist_mtx);
-
-	ASSERT3U(write_asize, <=, target_sz);
+	ASSERT3U(write_psize, <=, target_sz);
 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
 	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
 	ARCSTAT_INCR(arcstat_l2_size, write_sz);
 	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
-	vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
+	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
 
 	/*
 	 * Bump device hand to the device start if it is approaching the end.
@@ -5103,10 +7078,7 @@
 	 * l2arc_evict() will already have evicted ahead for this case.
 	 */
 	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
-		vdev_space_update(dev->l2ad_vdev,
-		    dev->l2ad_end - dev->l2ad_hand, 0, 0);
 		dev->l2ad_hand = dev->l2ad_start;
-		dev->l2ad_evict = dev->l2ad_start;
 		dev->l2ad_first = B_FALSE;
 	}
 
@@ -5118,152 +7090,6 @@
 }
 
 /*
- * Compresses an L2ARC buffer.
- * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
- * size in l2hdr->b_asize. This routine tries to compress the data and
- * depending on the compression result there are three possible outcomes:
- * *) The buffer was incompressible. The original l2hdr contents were left
- *    untouched and are ready for writing to an L2 device.
- * *) The buffer was all-zeros, so there is no need to write it to an L2
- *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
- *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
- * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
- *    data buffer which holds the compressed data to be written, and b_asize
- *    tells us how much data there is. b_compress is set to the appropriate
- *    compression algorithm. Once writing is done, invoke
- *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
- *
- * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
- * buffer was incompressible).
- */
-static boolean_t
-l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
-{
-	void *cdata;
-	size_t csize, len;
-
-	ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
-	ASSERT(l2hdr->b_tmp_cdata != NULL);
-
-	len = l2hdr->b_asize;
-	cdata = zio_data_buf_alloc(len);
-	csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
-	    cdata, l2hdr->b_asize);
-
-	if (csize == 0) {
-		/* zero block, indicate that there's nothing to write */
-		zio_data_buf_free(cdata, len);
-		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
-		l2hdr->b_asize = 0;
-		l2hdr->b_tmp_cdata = NULL;
-		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
-		return (B_TRUE);
-	} else if (csize > 0 && csize < len) {
-		/*
-		 * Compression succeeded, we'll keep the cdata around for
-		 * writing and release it afterwards.
-		 */
-		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
-		l2hdr->b_asize = csize;
-		l2hdr->b_tmp_cdata = cdata;
-		ARCSTAT_BUMP(arcstat_l2_compress_successes);
-		return (B_TRUE);
-	} else {
-		/*
-		 * Compression failed, release the compressed buffer.
-		 * l2hdr will be left unmodified.
-		 */
-		zio_data_buf_free(cdata, len);
-		ARCSTAT_BUMP(arcstat_l2_compress_failures);
-		return (B_FALSE);
-	}
-}
-
-/*
- * Decompresses a zio read back from an l2arc device. On success, the
- * underlying zio's io_data buffer is overwritten by the uncompressed
- * version. On decompression error (corrupt compressed stream), the
- * zio->io_error value is set to signal an I/O error.
- *
- * Please note that the compressed data stream is not checksummed, so
- * if the underlying device is experiencing data corruption, we may feed
- * corrupt data to the decompressor, so the decompressor needs to be
- * able to handle this situation (LZ4 does).
- */
-static void
-l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
-{
-	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
-
-	if (zio->io_error != 0) {
-		/*
-		 * An io error has occured, just restore the original io
-		 * size in preparation for a main pool read.
-		 */
-		zio->io_orig_size = zio->io_size = hdr->b_size;
-		return;
-	}
-
-	if (c == ZIO_COMPRESS_EMPTY) {
-		/*
-		 * An empty buffer results in a null zio, which means we
-		 * need to fill its io_data after we're done restoring the
-		 * buffer's contents.
-		 */
-		ASSERT(hdr->b_buf != NULL);
-		bzero(hdr->b_buf->b_data, hdr->b_size);
-		zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
-	} else {
-		ASSERT(zio->io_data != NULL);
-		/*
-		 * We copy the compressed data from the start of the arc buffer
-		 * (the zio_read will have pulled in only what we need, the
-		 * rest is garbage which we will overwrite at decompression)
-		 * and then decompress back to the ARC data buffer. This way we
-		 * can minimize copying by simply decompressing back over the
-		 * original compressed data (rather than decompressing to an
-		 * aux buffer and then copying back the uncompressed buffer,
-		 * which is likely to be much larger).
-		 */
-		uint64_t csize;
-		void *cdata;
-
-		csize = zio->io_size;
-		cdata = zio_data_buf_alloc(csize);
-		bcopy(zio->io_data, cdata, csize);
-		if (zio_decompress_data(c, cdata, zio->io_data, csize,
-		    hdr->b_size) != 0)
-			zio->io_error = EIO;
-		zio_data_buf_free(cdata, csize);
-	}
-
-	/* Restore the expected uncompressed IO size. */
-	zio->io_orig_size = zio->io_size = hdr->b_size;
-}
-
-/*
- * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
- * This buffer serves as a temporary holder of compressed data while
- * the buffer entry is being written to an l2arc device. Once that is
- * done, we can dispose of it.
- */
-static void
-l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
-{
-	l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
-
-	if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
-		/*
-		 * If the data was compressed, then we've allocated a
-		 * temporary buffer for it, so now we need to release it.
-		 */
-		ASSERT(l2hdr->b_tmp_cdata != NULL);
-		zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
-	}
-	l2hdr->b_tmp_cdata = NULL;
-}
-
-/*
  * This thread feeds the L2ARC at regular intervals.  This is the beating
  * heart of the L2ARC.
  */
@@ -5275,7 +7101,6 @@
 	spa_t *spa;
 	uint64_t size, wrote;
 	clock_t begin, next = ddi_get_lbolt();
-	boolean_t headroom_boost = B_FALSE;
 
 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
@@ -5313,7 +7138,7 @@
 			continue;
 
 		spa = dev->l2ad_spa;
-		ASSERT(spa != NULL);
+		ASSERT3P(spa, !=, NULL);
 
 		/*
 		 * If the pool is read-only then force the feed thread to
@@ -5346,7 +7171,7 @@
 		/*
 		 * Write ARC buffers.
 		 */
-		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
+		wrote = l2arc_write_buffers(spa, dev, size);
 
 		/*
 		 * Calculate interval between writes.
@@ -5388,6 +7213,8 @@
 
 	ASSERT(!l2arc_vdev_present(vd));
 
+	vdev_ashift_optimize(vd);
+
 	/*
 	 * Create a new l2arc device entry.
 	 */
@@ -5397,19 +7224,19 @@
 	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
 	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
 	adddev->l2ad_hand = adddev->l2ad_start;
-	adddev->l2ad_evict = adddev->l2ad_start;
 	adddev->l2ad_first = B_TRUE;
 	adddev->l2ad_writing = B_FALSE;
 
+	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
 	/*
 	 * This is a list of all ARC buffers that are still valid on the
 	 * device.
 	 */
-	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
-	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
-	    offsetof(arc_buf_hdr_t, b_l2node));
+	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
 
 	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
+	refcount_create(&adddev->l2ad_alloc);
 
 	/*
 	 * Add device to global list
@@ -5439,7 +7266,7 @@
 			break;
 		}
 	}
-	ASSERT(remdev != NULL);
+	ASSERT3P(remdev, !=, NULL);
 
 	/*
 	 * Remove device from global list
@@ -5453,8 +7280,9 @@
 	 * Clear all buflists and ARC references.  L2ARC device flush.
 	 */
 	l2arc_evict(remdev, 0, B_TRUE);
-	list_destroy(remdev->l2ad_buflist);
-	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
+	list_destroy(&remdev->l2ad_buflist);
+	mutex_destroy(&remdev->l2ad_mtx);
+	refcount_destroy(&remdev->l2ad_alloc);
 	kmem_free(remdev, sizeof (l2arc_dev_t));
 }
 
@@ -5469,7 +7297,6 @@
 	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	l2arc_dev_list = &L2ARC_dev_list;
@@ -5494,7 +7321,6 @@
 	mutex_destroy(&l2arc_feed_thr_lock);
 	cv_destroy(&l2arc_feed_thr_cv);
 	mutex_destroy(&l2arc_dev_mtx);
-	mutex_destroy(&l2arc_buflist_mtx);
 	mutex_destroy(&l2arc_free_on_write_mtx);
 
 	list_destroy(l2arc_dev_list);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/bpobj.h>
@@ -37,22 +38,20 @@
 uint64_t
 bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
 {
-	zfeature_info_t *empty_bpobj_feat =
-	    &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
 	spa_t *spa = dmu_objset_spa(os);
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
-	if (spa_feature_is_enabled(spa, empty_bpobj_feat)) {
-		if (!spa_feature_is_active(spa, empty_bpobj_feat)) {
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
+		if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
 			ASSERT0(dp->dp_empty_bpobj);
 			dp->dp_empty_bpobj =
-			    bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
+			    bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
 			VERIFY(zap_add(os,
 			    DMU_POOL_DIRECTORY_OBJECT,
 			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
 			    &dp->dp_empty_bpobj, tx) == 0);
 		}
-		spa_feature_incr(spa, empty_bpobj_feat, tx);
+		spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
 		ASSERT(dp->dp_empty_bpobj != 0);
 		return (dp->dp_empty_bpobj);
 	} else {
@@ -63,12 +62,11 @@
 void
 bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
 {
-	zfeature_info_t *empty_bpobj_feat =
-	    &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
-	spa_feature_decr(dmu_objset_spa(os), empty_bpobj_feat, tx);
-	if (!spa_feature_is_active(dmu_objset_spa(os), empty_bpobj_feat)) {
+	spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
+	if (!spa_feature_is_active(dmu_objset_spa(os),
+	    SPA_FEATURE_EMPTY_BPOBJ)) {
 		VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_EMPTY_BPOBJ, tx));
@@ -196,6 +194,13 @@
 	mutex_destroy(&bpo->bpo_lock);
 }
 
+static boolean_t
+bpobj_hasentries(bpobj_t *bpo)
+{
+	return (bpo->bpo_phys->bpo_num_blkptrs != 0 ||
+	    (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs != 0));
+}
+
 static int
 bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
     boolean_t free)
@@ -253,9 +258,8 @@
 		dbuf = NULL;
 	}
 	if (free) {
-		i++;
 		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
-		    i * sizeof (blkptr_t), -1ULL, tx));
+		    (i + 1) * sizeof (blkptr_t), -1ULL, tx));
 	}
 	if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
 		goto out;
@@ -266,6 +270,7 @@
 		mutex_exit(&bpo->bpo_lock);
 		return (err);
 	}
+	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
 	epb = doi.doi_data_block_size / sizeof (uint64_t);
 
 	for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
@@ -297,8 +302,10 @@
 		if (free) {
 			err = bpobj_space(&sublist,
 			    &used_before, &comp_before, &uncomp_before);
-			if (err)
+			if (err != 0) {
+				bpobj_close(&sublist);
 				break;
+			}
 		}
 		err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
 		if (free) {
@@ -335,9 +342,11 @@
 
 out:
 	/* If there are no entries, there should be no bytes. */
-	ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
-	    (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
-	    bpo->bpo_phys->bpo_bytes == 0);
+	if (!bpobj_hasentries(bpo)) {
+		ASSERT0(bpo->bpo_phys->bpo_bytes);
+		ASSERT0(bpo->bpo_phys->bpo_comp);
+		ASSERT0(bpo->bpo_phys->bpo_uncomp);
+	}
 
 	mutex_exit(&bpo->bpo_lock);
 	return (err);
@@ -380,7 +389,7 @@
 	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
 	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
 
-	if (used == 0) {
+	if (!bpobj_hasentries(&subbpo)) {
 		/* No point in having an empty subobj. */
 		bpobj_close(&subbpo);
 		bpobj_free(bpo->bpo_os, subobj, tx);
@@ -390,7 +399,8 @@
 	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
 	if (bpo->bpo_phys->bpo_subobjs == 0) {
 		bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
-		    DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+		    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+		    DMU_OT_NONE, 0, tx);
 	}
 
 	dmu_object_info_t doi;
@@ -456,13 +466,29 @@
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
 
+	if (BP_IS_EMBEDDED(bp)) {
+		/*
+		 * The bpobj will compress better without the payload.
+		 *
+		 * Note that we store EMBEDDED bp's because they have an
+		 * uncompressed size, which must be accounted for.  An
+		 * alternative would be to add their size to bpo_uncomp
+		 * without storing the bp, but that would create additional
+		 * complications: bpo_uncomp would be inconsistent with the
+		 * set of BP's stored, and bpobj_iterate() wouldn't visit
+		 * all the space accounted for in the bpobj.
+		 */
+		bzero(&stored_bp, sizeof (stored_bp));
+		stored_bp.blk_prop = bp->blk_prop;
+		stored_bp.blk_birth = bp->blk_birth;
+	} else if (!BP_GET_DEDUP(bp)) {
+		/* The bpobj will compress better without the checksum */
+		bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+	}
+
 	/* We never need the fill count. */
 	stored_bp.blk_fill = 0;
 
-	/* The bpobj will compress better if we can leave off the checksum */
-	if (!BP_GET_DEDUP(bp))
-		bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
-
 	mutex_enter(&bpo->bpo_lock);
 
 	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
  */
 
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/arc.h>
@@ -66,7 +67,7 @@
 	bptree_phys_t *bt;
 
 	obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
-	    SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+	    SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
 	    sizeof (bptree_phys_t), tx);
 
 	/*
@@ -103,6 +104,20 @@
 	return (dmu_object_free(os, obj, tx));
 }
 
+boolean_t
+bptree_is_empty(objset_t *os, uint64_t obj)
+{
+	dmu_buf_t *db;
+	bptree_phys_t *bt;
+	boolean_t rv;
+
+	VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
+	bt = db->db_data;
+	rv = (bt->bt_begin == bt->bt_end);
+	dmu_buf_rele(db, FTAG);
+	return (rv);
+}
+
 void
 bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
     uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
@@ -109,7 +124,7 @@
 {
 	dmu_buf_t *db;
 	bptree_phys_t *bt;
-	bptree_entry_phys_t bte;
+	bptree_entry_phys_t bte = { 0 };
 
 	/*
 	 * bptree objects are in the pool mos, therefore they can only be
@@ -123,7 +138,6 @@
 
 	bte.be_birth_txg = birth_txg;
 	bte.be_bp = *bp;
-	bzero(&bte.be_zb, sizeof (bte.be_zb));
 	dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
 
 	dmu_buf_will_dirty(db, tx);
@@ -137,12 +151,12 @@
 /* ARGSUSED */
 static int
 bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	int err;
 	struct bptree_args *ba = arg;
 
-	if (bp == NULL)
+	if (bp == NULL || BP_IS_HOLE(bp))
 		return (0);
 
 	err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
@@ -154,10 +168,27 @@
 	return (err);
 }
 
+/*
+ * If "free" is set:
+ *  - It is assumed that "func" will be freeing the block pointers.
+ *  - If "func" returns nonzero, the bookmark will be remembered and
+ *    iteration will be restarted from this point on next invocation.
+ *  - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
+ *    bptree_iterate will remember the bookmark, continue traversing
+ *    any additional entries, and return 0.
+ *
+ * If "free" is not set, traversal will stop and return an error if
+ * an i/o error is encountered.
+ *
+ * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
+ * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
+ * traverse_dataset_destroyed()).
+ */
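
As a usage sketch only: the iterator signature below is inferred from how ba_func is invoked in bptree_visit_cb (arg, bp, tx), and the callback name is hypothetical; the snippet assumes the usual ZFS headers and is not standalone. With "free" not set, a callback can simply observe each block pointer:

	/* Hypothetical read-only callback; returning 0 continues traversal. */
	static int
	count_bps_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	{
		uint64_t *count = arg;

		(void) tx;	/* unused when not freeing */
		(*count)++;
		return (0);
	}

A caller might then tally entries with bptree_iterate(os, obj, B_FALSE, count_bps_cb, &count, tx).
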
 int
 bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
     void *arg, dmu_tx_t *tx)
 {
+	boolean_t ioerr = B_FALSE;
 	int err;
 	uint64_t i;
 	dmu_buf_t *db;
@@ -181,20 +212,33 @@
 	err = 0;
 	for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
 		bptree_entry_phys_t bte;
+		int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
 
-		ASSERT(!free || i == ba.ba_phys->bt_begin);
-
 		err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
 		    &bte, DMU_READ_NO_PREFETCH);
 		if (err != 0)
 			break;
 
+		if (zfs_free_leak_on_eio)
+			flags |= TRAVERSE_HARD;
+		zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld "
+		    "bookmark %lld/%lld/%lld/%lld",
+		    (longlong_t)i,
+		    (longlong_t)bte.be_birth_txg,
+		    (longlong_t)bte.be_zb.zb_objset,
+		    (longlong_t)bte.be_zb.zb_object,
+		    (longlong_t)bte.be_zb.zb_level,
+		    (longlong_t)bte.be_zb.zb_blkid);
 		err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
-		    bte.be_birth_txg, &bte.be_zb,
-		    TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST,
+		    bte.be_birth_txg, &bte.be_zb, flags,
 		    bptree_visit_cb, &ba);
 		if (free) {
-			ASSERT(err == 0 || err == ERESTART);
+			/*
+			 * The callback has freed the visited block pointers.
+			 * Record our traversal progress on disk, either by
+			 * updating this record's bookmark, or by logically
+			 * removing this record by advancing bt_begin.
+			 */
 			if (err != 0) {
 				/* save bookmark for future resume */
 				ASSERT3U(bte.be_zb.zb_objset, ==,
@@ -202,19 +246,51 @@
 				ASSERT0(bte.be_zb.zb_level);
 				dmu_write(os, obj, i * sizeof (bte),
 				    sizeof (bte), &bte, tx);
-				break;
-			} else {
+				if (err == EIO || err == ECKSUM ||
+				    err == ENXIO) {
+					/*
+					 * Skip the rest of this tree and
+					 * continue on to the next entry.
+					 */
+					err = 0;
+					ioerr = B_TRUE;
+				} else {
+					break;
+				}
+			} else if (ioerr) {
+				/*
+				 * This entry is finished, but there were
+				 * i/o errors on previous entries, so we
+				 * can't adjust bt_begin.  Set this entry's
+				 * be_birth_txg such that it will be
+				 * treated as a no-op in future traversals.
+				 */
+				bte.be_birth_txg = UINT64_MAX;
+				dmu_write(os, obj, i * sizeof (bte),
+				    sizeof (bte), &bte, tx);
+			}
+
+			if (!ioerr) {
 				ba.ba_phys->bt_begin++;
 				(void) dmu_free_range(os, obj,
 				    i * sizeof (bte), sizeof (bte), tx);
 			}
+		} else if (err != 0) {
+			break;
 		}
 	}
 
-	ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
+	ASSERT(!free || err != 0 || ioerr ||
+	    ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
 
 	/* if all blocks are free there should be no used space */
 	if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
+		if (zfs_free_leak_on_eio) {
+			ba.ba_phys->bt_bytes = 0;
+			ba.ba_phys->bt_comp = 0;
+			ba.ba_phys->bt_uncomp = 0;
+		}
+
 		ASSERT0(ba.ba_phys->bt_bytes);
 		ASSERT0(ba.ba_phys->bt_comp);
 		ASSERT0(ba.ba_phys->bt_uncomp);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,12 +22,16 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu.h>
+#include <sys/dmu_send.h>
 #include <sys/dmu_impl.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_objset.h>
@@ -39,8 +43,19 @@
 #include <sys/dmu_zfetch.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/blkptr.h>
+#include <sys/range_tree.h>
+#include <sys/callb.h>
 
-static void dbuf_destroy(dmu_buf_impl_t *db);
+uint_t zfs_dbuf_evict_key;
+
+/*
+ * Number of times that zfs_free_range() took the slow path while doing
+ * a zfs receive.  A nonzero value indicates a potential performance problem.
+ */
+uint64_t zfs_free_range_recv_miss;
+
 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 
@@ -47,8 +62,76 @@
 /*
  * Global data structures and functions for the dbuf cache.
  */
-static kmem_cache_t *dbuf_cache;
+static kmem_cache_t *dbuf_kmem_cache;
+static taskq_t *dbu_evict_taskq;
 
+static kthread_t *dbuf_cache_evict_thread;
+static kmutex_t dbuf_evict_lock;
+static kcondvar_t dbuf_evict_cv;
+static boolean_t dbuf_evict_thread_exit;
+
+/*
+ * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
+ * are not currently held but have been recently released. These dbufs
+ * are not eligible for arc eviction until they are aged out of the cache.
+ * Dbufs are added to the dbuf cache once the last hold is released. If a
+ * dbuf is later accessed and still exists in the dbuf cache, then it will
+ * be removed from the cache and later re-added to the head of the cache.
+ * Dbufs that are aged out of the cache will be immediately destroyed and
+ * become eligible for arc eviction.
+ */
+static multilist_t dbuf_cache;
+static refcount_t dbuf_cache_size;
+uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024;
+
+/* Cap the size of the dbuf cache to log2 fraction of arc size. */
+int dbuf_cache_max_shift = 5;
+
+/*
+ * The dbuf cache uses a three-stage eviction policy:
+ *	- A low water marker designates when the dbuf eviction thread
+ *	should stop evicting from the dbuf cache.
+ *	- When we reach the maximum size (aka mid water mark), we
+ *	signal the eviction thread to run.
+ *	- The high water mark indicates when the eviction thread
+ *	is unable to keep up with the incoming load and eviction must
+ *	happen in the context of the calling thread.
+ *
+ * The dbuf cache:
+ *                                                 (max size)
+ *                                      low water   mid water   hi water
+ * +----------------------------------------+----------+----------+
+ * |                                        |          |          |
+ * |                                        |          |          |
+ * |                                        |          |          |
+ * |                                        |          |          |
+ * +----------------------------------------+----------+----------+
+ *                                        stop        signal     evict
+ *                                      evicting     eviction   directly
+ *                                                    thread
+ *
+ * The high and low water marks indicate the operating range for the eviction
+ * thread. The low water mark is, by default, 90% of the total size of the
+ * cache and the high water mark is at 110% (both of these percentages can be
+ * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
+ * respectively). The eviction thread will try to ensure that the cache remains
+ * within this range by waking up every second and checking if the cache is
+ * above the low water mark. The thread can also be woken up by callers adding
+ * elements into the cache if the cache is larger than the mid water (i.e. max
+ * cache size). Once the eviction thread is woken up and eviction is required,
+ * it will continue evicting buffers until it's able to reduce the cache size
+ * to the low water mark. If the cache size continues to grow and hits the high
+ * water mark, then callers adding elements to the cache will begin to evict
+ * directly from the cache until the cache is no longer above the high water
+ * mark.
+ */
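
Numerically, with the defaults above (100MB cache, 10% bands), the thresholds work out as in this sketch; the high-water formula matches dbuf_cache_above_hiwater() below, and the low-water formula is inferred from the 90% figure in the comment:

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t max_bytes = 100ULL * 1024 * 1024; /* dbuf_cache_max_bytes */
		uint64_t hiwater_pct = 10, lowater_pct = 10;
		uint64_t hiwater = max_bytes + (max_bytes * hiwater_pct) / 100;
		uint64_t lowater = max_bytes - (max_bytes * lowater_pct) / 100;

		/* ~110MB: callers evict inline; ~90MB: eviction thread stops */
		printf("hi=%ju lo=%ju\n", (uintmax_t)hiwater, (uintmax_t)lowater);
		return (0);
	}
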
+
+/*
+ * The percentage above and below the maximum cache size.
+ */
+uint_t dbuf_cache_hiwater_pct = 10;
+uint_t dbuf_cache_lowater_pct = 10;
+
 /* ARGSUSED */
 static int
 dbuf_cons(void *vdb, void *unused, int kmflag)
@@ -58,7 +141,9 @@
 
 	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+	multilist_link_init(&db->db_cache_link);
 	refcount_create(&db->db_holds);
+
 	return (0);
 }
 
@@ -69,6 +154,7 @@
 	dmu_buf_impl_t *db = vdb;
 	mutex_destroy(&db->db_mtx);
 	cv_destroy(&db->db_changed);
+	ASSERT(!multilist_link_active(&db->db_cache_link));
 	refcount_destroy(&db->db_holds);
 }
 
@@ -98,8 +184,6 @@
 	return (crc);
 }
 
-#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
-
 #define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
 	((dbuf)->db.db_object == (obj) &&		\
 	(dbuf)->db_objset == (os) &&			\
@@ -107,12 +191,10 @@
 	(dbuf)->db_blkid == (blkid))
 
 dmu_buf_impl_t *
-dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
+dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
-	objset_t *os = dn->dn_objset;
-	uint64_t obj = dn->dn_object;
-	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+	uint64_t hv = dbuf_hash(os, obj, level, blkid);
 	uint64_t idx = hv & h->hash_table_mask;
 	dmu_buf_impl_t *db;
 
@@ -131,6 +213,24 @@
 	return (NULL);
 }
 
+static dmu_buf_impl_t *
+dbuf_find_bonus(objset_t *os, uint64_t object)
+{
+	dnode_t *dn;
+	dmu_buf_impl_t *db = NULL;
+
+	if (dnode_hold(os, object, FTAG, &dn) == 0) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		if (dn->dn_bonus != NULL) {
+			db = dn->dn_bonus;
+			mutex_enter(&db->db_mtx);
+		}
+		rw_exit(&dn->dn_struct_rwlock);
+		dnode_rele(dn, FTAG);
+	}
+	return (db);
+}
+
 /*
  * Insert an entry into the hash table.  If there is already an element
  * equal to elem in the hash table, then the already existing element
@@ -145,7 +245,7 @@
 	uint64_t obj = db->db.db_object;
 	int level = db->db_level;
 	uint64_t blkid = db->db_blkid;
-	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+	uint64_t hv = dbuf_hash(os, obj, level, blkid);
 	uint64_t idx = hv & h->hash_table_mask;
 	dmu_buf_impl_t *dbf;
 
@@ -165,26 +265,25 @@
 	db->db_hash_next = h->hash_table[idx];
 	h->hash_table[idx] = db;
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
-	atomic_add_64(&dbuf_hash_count, 1);
+	atomic_inc_64(&dbuf_hash_count);
 
 	return (NULL);
 }
 
 /*
- * Remove an entry from the hash table.  This operation will
- * fail if there are any existing holds on the db.
+ * Remove an entry from the hash table.  It must be in the EVICTING state.
  */
 static void
 dbuf_hash_remove(dmu_buf_impl_t *db)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
-	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
+	uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object,
 	    db->db_level, db->db_blkid);
 	uint64_t idx = hv & h->hash_table_mask;
 	dmu_buf_impl_t *dbf, **dbp;
 
 	/*
-	 * We musn't hold db_mtx to maintin lock ordering:
+	 * We mustn't hold db_mtx to maintain lock ordering:
 	 * DBUF_HASH_MUTEX > db_mtx.
 	 */
 	ASSERT(refcount_is_zero(&db->db_holds));
@@ -200,25 +299,75 @@
 	*dbp = db->db_hash_next;
 	db->db_hash_next = NULL;
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
-	atomic_add_64(&dbuf_hash_count, -1);
+	atomic_dec_64(&dbuf_hash_count);
 }
 
-static arc_evict_func_t dbuf_do_evict;
+typedef enum {
+	DBVU_EVICTING,
+	DBVU_NOT_EVICTING
+} dbvu_verify_type_t;
 
 static void
+dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
+{
+#ifdef ZFS_DEBUG
+	int64_t holds;
+
+	if (db->db_user == NULL)
+		return;
+
+	/* Only data blocks support the attachment of user data. */
+	ASSERT(db->db_level == 0);
+
+	/* Clients must resolve a dbuf before attaching user data. */
+	ASSERT(db->db.db_data != NULL);
+	ASSERT3U(db->db_state, ==, DB_CACHED);
+
+	holds = refcount_count(&db->db_holds);
+	if (verify_type == DBVU_EVICTING) {
+		/*
+		 * Immediate eviction occurs when holds == dirtycnt.
+		 * For normal eviction buffers, holds is zero on
+		 * eviction, except when dbuf_fix_old_data() calls
+		 * dbuf_clear_data().  However, the hold count can grow
+		 * during eviction even though db_mtx is held (see
+		 * dmu_bonus_hold() for an example), so we can only
+		 * test the generic invariant that holds >= dirtycnt.
+		 */
+		ASSERT3U(holds, >=, db->db_dirtycnt);
+	} else {
+		if (db->db_user_immediate_evict == TRUE)
+			ASSERT3U(holds, >=, db->db_dirtycnt);
+		else
+			ASSERT3U(holds, >, 0);
+	}
+#endif
+}
+
+static void
 dbuf_evict_user(dmu_buf_impl_t *db)
 {
+	dmu_buf_user_t *dbu = db->db_user;
+
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
-	if (db->db_level != 0 || db->db_evict_func == NULL)
+	if (dbu == NULL)
 		return;
 
-	if (db->db_user_data_ptr_ptr)
-		*db->db_user_data_ptr_ptr = db->db.db_data;
-	db->db_evict_func(&db->db, db->db_user_ptr);
-	db->db_user_ptr = NULL;
-	db->db_user_data_ptr_ptr = NULL;
-	db->db_evict_func = NULL;
+	dbuf_verify_user(db, DBVU_EVICTING);
+	db->db_user = NULL;
+
+#ifdef ZFS_DEBUG
+	if (dbu->dbu_clear_on_evict_dbufp != NULL)
+		*dbu->dbu_clear_on_evict_dbufp = NULL;
+#endif
+
+	/*
+	 * Invoke the callback from a taskq to avoid lock order reversals
+	 * and limit stack depth.
+	 */
+	taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0,
+	    &dbu->dbu_tqent);
 }
 
 boolean_t
@@ -237,17 +386,183 @@
 	}
 }
 
-void
-dbuf_evict(dmu_buf_impl_t *db)
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the dbuf eviction
+ * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
 {
-	ASSERT(MUTEX_HELD(&db->db_mtx));
-	ASSERT(db->db_buf == NULL);
-	ASSERT(db->db_data_pending == NULL);
+	dmu_buf_impl_t *db = obj;
 
-	dbuf_clear(db);
-	dbuf_destroy(db);
+	/*
+	 * The assumption here is that the hash value for a given
+	 * dmu_buf_impl_t will remain constant throughout its lifetime
+	 * (i.e., its objset, object, level, and blkid fields don't change).
+	 * Thus, we don't need to store the dbuf's sublist index
+	 * on insertion, as this index can be recalculated on removal.
+	 *
+	 * Also, the low order bits of the hash value are thought to be
+	 * distributed evenly. Otherwise, in the case that the multilist
+	 * has a power-of-two number of sublists, each sublist's usage
+	 * would not be evenly distributed.
+	 */
+	return (dbuf_hash(db->db_objset, db->db.db_object,
+	    db->db_level, db->db_blkid) %
+	    multilist_get_num_sublists(ml));
 }
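
A quick illustration of why the sublist index need not be stored: it depends
only on the dbuf's immutable identity fields, so it can be recomputed on
removal. A minimal standalone sketch, where toy_hash() and the sublist count
are made-up stand-ins for dbuf_hash() and the real configuration:

    #include <stdio.h>
    #include <stdint.h>

    /* Toy mixer standing in for dbuf_hash(); only its stability matters. */
    static uint64_t
    toy_hash(uint64_t obj, uint8_t level, uint64_t blkid)
    {
    	uint64_t h = obj * 0x9e3779b97f4a7c15ULL ^ (blkid << 8) ^ level;
    	return (h ^ (h >> 33));
    }

    int
    main(void)
    {
    	unsigned num_sublists = 64;	/* hypothetical sublist count */

    	/* The identity fields never change, so insert and remove arrive
    	 * at the same index without storing it in the dbuf. */
    	for (uint64_t blkid = 0; blkid < 4; blkid++)
    		printf("obj 5, level 0, blkid %llu -> sublist %llu\n",
    		    (unsigned long long)blkid,
    		    (unsigned long long)(toy_hash(5, 0, blkid) %
    		    num_sublists));
    	return (0);
    }
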
 
+static inline boolean_t
+dbuf_cache_above_hiwater(void)
+{
+	uint64_t dbuf_cache_hiwater_bytes =
+	    (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
+
+	return (refcount_count(&dbuf_cache_size) >
+	    dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
+}
+
+static inline boolean_t
+dbuf_cache_above_lowater(void)
+{
+	uint64_t dbuf_cache_lowater_bytes =
+	    (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
+
+	return (refcount_count(&dbuf_cache_size) >
+	    dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
+}
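
A worked example of the watermark arithmetic, using a hypothetical 100 MiB
cache cap and the 10% defaults above: direct eviction kicks in above 110 MiB,
and the eviction thread keeps going until the cache drops back to 90 MiB. A
standalone sketch:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
    	uint64_t max_bytes = 100ULL << 20;	/* hypothetical 100 MiB cap */
    	unsigned hiwater_pct = 10, lowater_pct = 10;

    	uint64_t hiwater = max_bytes + (max_bytes * hiwater_pct) / 100;
    	uint64_t lowater = max_bytes - (max_bytes * lowater_pct) / 100;

    	/* Callers evict directly above hiwater; the eviction thread
    	 * evicts until the cache size falls below lowater. */
    	printf("hiwater = %llu MiB, lowater = %llu MiB\n",
    	    (unsigned long long)(hiwater >> 20),
    	    (unsigned long long)(lowater >> 20));	/* 110 and 90 */
    	return (0);
    }
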
+
+/*
+ * Evict the oldest eligible dbuf from the dbuf cache.
+ */
+static void
+dbuf_evict_one(void)
+{
+	int idx = multilist_get_random_index(&dbuf_cache);
+	multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx);
+
+	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
+
+	/*
+	 * Set the thread's tsd to indicate that it's processing evictions.
+	 * Once a thread stops evicting from the dbuf cache it will
+	 * reset its tsd to NULL.
+	 */
+	ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
+	(void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
+
+	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
+	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
+		db = multilist_sublist_prev(mls, db);
+	}
+
+	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
+	    multilist_sublist_t *, mls);
+
+	if (db != NULL) {
+		multilist_sublist_remove(mls, db);
+		multilist_sublist_unlock(mls);
+		(void) refcount_remove_many(&dbuf_cache_size,
+		    db->db.db_size, db);
+		dbuf_destroy(db);
+	} else {
+		multilist_sublist_unlock(mls);
+	}
+	(void) tsd_set(zfs_dbuf_evict_key, NULL);
+}
+
+/*
+ * The dbuf evict thread is responsible for aging out dbufs from the
+ * cache. Once the cache has reached its maximum size, dbufs are removed
+ * and destroyed. The eviction thread will continue running until the size
+ * of the dbuf cache is at or below the low water mark. Once a dbuf is aged
+ * out of the cache it is destroyed and becomes eligible for arc eviction.
+ */
+static void
+dbuf_evict_thread(void *dummy __unused)
+{
+	callb_cpr_t cpr;
+
+	CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
+
+	mutex_enter(&dbuf_evict_lock);
+	while (!dbuf_evict_thread_exit) {
+		while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+			CALLB_CPR_SAFE_BEGIN(&cpr);
+			(void) cv_timedwait_hires(&dbuf_evict_cv,
+			    &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
+			CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
+		}
+		mutex_exit(&dbuf_evict_lock);
+
+		/*
+		 * Keep evicting as long as we're above the low water mark
+		 * for the cache. We do this without holding the locks to
+		 * minimize lock contention.
+		 */
+		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+			dbuf_evict_one();
+		}
+
+		mutex_enter(&dbuf_evict_lock);
+	}
+
+	dbuf_evict_thread_exit = B_FALSE;
+	cv_broadcast(&dbuf_evict_cv);
+	CALLB_CPR_EXIT(&cpr);	/* drops dbuf_evict_lock */
+	thread_exit();
+}
+
+/*
+ * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
+ * If the dbuf cache is at its high water mark, then evict a dbuf from the
+ * dbuf cache using the caller's context.
+ */
+static void
+dbuf_evict_notify(void)
+{
+
+	/*
+	 * We use thread specific data to track when a thread has
+	 * started processing evictions. This allows us to avoid deeply
+	 * nested stacks that would have a call flow similar to this:
+	 *
+	 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
+	 *	^						|
+	 *	|						|
+	 *	+-----dbuf_destroy()<--dbuf_evict_one()<--------+
+	 *
+	 * The dbuf_eviction_thread will always have its tsd set until
+	 * that thread exits. All other threads will only set their tsd
+	 * if they are participating in the eviction process. This only
+	 * happens if the eviction thread is unable to process evictions
+	 * fast enough. To keep the dbuf cache size in check, other threads
+	 * can evict from the dbuf cache directly. Those threads will set
+	 * their tsd values so that we ensure that they only evict one dbuf
+	 * from the dbuf cache.
+	 */
+	if (tsd_get(zfs_dbuf_evict_key) != NULL)
+		return;
+
+	if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
+		boolean_t evict_now = B_FALSE;
+
+		mutex_enter(&dbuf_evict_lock);
+		if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
+			evict_now = dbuf_cache_above_hiwater();
+			cv_signal(&dbuf_evict_cv);
+		}
+		mutex_exit(&dbuf_evict_lock);
+
+		if (evict_now) {
+			dbuf_evict_one();
+		}
+	}
+}
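
The recursion guard above can be sketched with POSIX thread-specific data in
place of the kernel tsd_* API; evict_one() and evict_notify() here are
simplified stand-ins for dbuf_evict_one() and dbuf_evict_notify():

    #include <pthread.h>
    #include <stdio.h>

    static pthread_key_t evict_key;

    static void evict_one(void);

    static void
    evict_notify(void)
    {
    	/* Already evicting on this thread: bail out immediately so the
    	 * rele -> notify -> evict -> destroy -> rele chain can't recurse. */
    	if (pthread_getspecific(evict_key) != NULL)
    		return;
    	evict_one();
    }

    static void
    evict_one(void)
    {
    	(void) pthread_setspecific(evict_key, (void *)1);
    	/* ... eviction work that may re-enter evict_notify() ... */
    	evict_notify();	/* returns at once because the key is set */
    	(void) pthread_setspecific(evict_key, NULL);
    	printf("evicted one dbuf without recursing\n");
    }

    int
    main(void)
    {
    	(void) pthread_key_create(&evict_key, NULL);
    	evict_notify();
    	return (0);
    }
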
+
 void
 dbuf_init(void)
 {
@@ -273,12 +588,38 @@
 		goto retry;
 	}
 
-	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
+	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
 	    sizeof (dmu_buf_impl_t),
 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 
 	for (i = 0; i < DBUF_MUTEXES; i++)
 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+
+	/*
+	 * Set up the parameters for the dbuf cache. We cap the size of the
+	 * dbuf cache to 1/32nd (default) of the size of the ARC.
+	 */
+	dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes,
+	    arc_max_bytes() >> dbuf_cache_max_shift);
+
+	/*
+	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
+	 * configuration is not required.
+	 */
+	dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
+
+	multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t),
+	    offsetof(dmu_buf_impl_t, db_cache_link),
+	    zfs_arc_num_sublists_per_state,
+	    dbuf_cache_multilist_index_func);
+	refcount_create(&dbuf_cache_size);
+
+	tsd_create(&zfs_dbuf_evict_key, NULL);
+	dbuf_evict_thread_exit = B_FALSE;
+	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
+	dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
+	    NULL, 0, &p0, TS_RUN, minclsyspri);
 }
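
As a quick sanity check of the cap arithmetic (the ARC size here is
hypothetical; the real value comes from arc_max_bytes()): a 4 GiB ARC with
the default shift of 5 yields a 128 MiB dbuf cache.

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
    	uint64_t arc_max = 4ULL << 30;	/* hypothetical 4 GiB ARC limit */
    	int dbuf_cache_max_shift = 5;	/* default: cap at 1/32nd of ARC */

    	printf("dbuf cache cap = %llu MiB\n",
    	    (unsigned long long)((arc_max >> dbuf_cache_max_shift) >> 20));
    	/* prints 128 */
    	return (0);
    }
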
 
 void
@@ -290,7 +631,23 @@
 	for (i = 0; i < DBUF_MUTEXES; i++)
 		mutex_destroy(&h->hash_mutexes[i]);
 	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
-	kmem_cache_destroy(dbuf_cache);
+	kmem_cache_destroy(dbuf_kmem_cache);
+	taskq_destroy(dbu_evict_taskq);
+
+	mutex_enter(&dbuf_evict_lock);
+	dbuf_evict_thread_exit = B_TRUE;
+	while (dbuf_evict_thread_exit) {
+		cv_signal(&dbuf_evict_cv);
+		cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
+	}
+	mutex_exit(&dbuf_evict_lock);
+	tsd_destroy(&zfs_dbuf_evict_key);
+
+	mutex_destroy(&dbuf_evict_lock);
+	cv_destroy(&dbuf_evict_cv);
+
+	refcount_destroy(&dbuf_cache_size);
+	multilist_destroy(&dbuf_cache);
 }
 
 /*
@@ -321,7 +678,7 @@
 		ASSERT3U(db->db_level, <, dn->dn_nlevels);
 		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
 		    db->db_blkid == DMU_SPILL_BLKID ||
-		    !list_is_empty(&dn->dn_dbufs));
+		    !avl_is_empty(&dn->dn_dbufs));
 	}
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		ASSERT(dn != NULL);
@@ -393,13 +750,49 @@
 		 * If the blkptr isn't set but they have nonzero data,
 		 * it had better be dirty, otherwise we'll lose that
 		 * data when we evict this buffer.
+		 *
+		 * There is an exception to this rule for indirect blocks; in
+		 * this case, if the indirect block is a hole, we fill in a few
+		 * fields on each of the child blocks (importantly, birth time)
+		 * to prevent hole birth times from being lost when you
+		 * partially fill in a hole.
 		 */
 		if (db->db_dirtycnt == 0) {
-			uint64_t *buf = db->db.db_data;
-			int i;
+			if (db->db_level == 0) {
+				uint64_t *buf = db->db.db_data;
+				int i;
 
-			for (i = 0; i < db->db.db_size >> 3; i++) {
-				ASSERT(buf[i] == 0);
+				for (i = 0; i < db->db.db_size >> 3; i++) {
+					ASSERT(buf[i] == 0);
+				}
+			} else {
+				blkptr_t *bps = db->db.db_data;
+				ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
+				    db->db.db_size);
+				/*
+				 * We want to verify that all the blkptrs in the
+				 * indirect block are holes, but we may have
+				 * automatically set up a few fields for them.
+				 * We iterate through each blkptr and verify
+				 * they only have those fields set.
+				 */
+				for (int i = 0;
+				    i < db->db.db_size / sizeof (blkptr_t);
+				    i++) {
+					blkptr_t *bp = &bps[i];
+					ASSERT(ZIO_CHECKSUM_IS_ZERO(
+					    &bp->blk_cksum));
+					ASSERT(
+					    DVA_IS_EMPTY(&bp->blk_dva[0]) &&
+					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
+					    DVA_IS_EMPTY(&bp->blk_dva[2]));
+					ASSERT0(bp->blk_fill);
+					ASSERT0(bp->blk_pad[0]);
+					ASSERT0(bp->blk_pad[1]);
+					ASSERT(!BP_IS_EMBEDDED(bp));
+					ASSERT(BP_IS_HOLE(bp));
+					ASSERT0(bp->blk_phys_birth);
+				}
 			}
 		}
 	}
@@ -408,13 +801,14 @@
 #endif
 
 static void
-dbuf_update_data(dmu_buf_impl_t *db)
+dbuf_clear_data(dmu_buf_impl_t *db)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
-	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
-		ASSERT(!refcount_is_zero(&db->db_holds));
-		*db->db_user_data_ptr_ptr = db->db.db_data;
-	}
+	dbuf_evict_user(db);
+	ASSERT3P(db->db_buf, ==, NULL);
+	db->db.db_data = NULL;
+	if (db->db_state != DB_NOFILL)
+		db->db_state = DB_UNCACHED;
 }
 
 static void
@@ -421,20 +815,11 @@
 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
-	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
+	ASSERT(buf != NULL);
+
 	db->db_buf = buf;
-	if (buf != NULL) {
-		ASSERT(buf->b_data != NULL);
-		db->db.db_data = buf->b_data;
-		if (!arc_released(buf))
-			arc_set_callback(buf, dbuf_do_evict, db);
-		dbuf_update_data(db);
-	} else {
-		dbuf_evict_user(db);
-		db->db.db_data = NULL;
-		if (db->db_state != DB_NOFILL)
-			db->db_state = DB_UNCACHED;
-	}
+	ASSERT(buf->b_data != NULL);
+	db->db.db_data = buf->b_data;
 }
 
 /*
@@ -445,29 +830,54 @@
 {
 	arc_buf_t *abuf;
 
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	mutex_enter(&db->db_mtx);
 	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
 		int blksz = db->db.db_size;
-		spa_t *spa;
+		spa_t *spa = db->db_objset->os_spa;
 
 		mutex_exit(&db->db_mtx);
-		DB_GET_SPA(&spa, db);
 		abuf = arc_loan_buf(spa, blksz);
 		bcopy(db->db.db_data, abuf->b_data, blksz);
 	} else {
 		abuf = db->db_buf;
 		arc_loan_inuse_buf(abuf, db);
-		dbuf_set_data(db, NULL);
+		db->db_buf = NULL;
+		dbuf_clear_data(db);
 		mutex_exit(&db->db_mtx);
 	}
 	return (abuf);
 }
 
+/*
+ * Calculate which level n block references the data at the level 0 offset
+ * provided.
+ */
 uint64_t
-dbuf_whichblock(dnode_t *dn, uint64_t offset)
+dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
 {
-	if (dn->dn_datablkshift) {
-		return (offset >> dn->dn_datablkshift);
+	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
+		/*
+		 * The level n blkid is equal to the level 0 blkid divided by
+		 * the number of level 0s in a level n block.
+		 *
+		 * The level 0 blkid is offset >> datablkshift =
+		 * offset / 2^datablkshift.
+		 *
+		 * The number of level 0s in a level n block is the number of
+		 * pointers in an indirect block, raised to the power of level.
+		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
+		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
+		 *
+		 * Thus, the level n blkid is: offset /
+		 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
+		 * = offset / 2^(datablkshift + level *
+		 *   (indblkshift - SPA_BLKPTRSHIFT))
+		 * = offset >> (datablkshift + level *
+		 *   (indblkshift - SPA_BLKPTRSHIFT))
+		 */
+		return (offset >> (dn->dn_datablkshift + level *
+		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
 	} else {
 		ASSERT3U(offset, <, dn->dn_datablksz);
 		return (0);
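
The derivation above can be checked numerically. Assuming a hypothetical
dnode with 128 KiB data and indirect blocks (datablkshift = indblkshift = 17,
so epbs = 10 given 128-byte block pointers), the level n blkids for an offset
of 1 GiB come out as follows:

    #include <stdio.h>
    #include <stdint.h>

    #define	SPA_BLKPTRSHIFT	7	/* 128-byte block pointers */

    int
    main(void)
    {
    	int datablkshift = 17, indblkshift = 17;	/* 128 KiB blocks */
    	uint64_t offset = 1ULL << 30;			/* 1 GiB */

    	for (int64_t level = 0; level <= 2; level++)
    		printf("level %lld blkid = %llu\n", (long long)level,
    		    (unsigned long long)(offset >> (datablkshift +
    		    level * (indblkshift - SPA_BLKPTRSHIFT))));
    	/* prints 8192, 8, 0: each level covers 1024x more data */
    	return (0);
    }
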
@@ -501,7 +911,7 @@
 	} else {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT3P(db->db_buf, ==, NULL);
-		VERIFY(arc_buf_remove_ref(buf, db));
+		arc_buf_destroy(buf, db);
 		db->db_state = DB_UNCACHED;
 	}
 	cv_broadcast(&db->db_changed);
@@ -509,12 +919,11 @@
 }
 
 static void
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
 	dnode_t *dn;
-	spa_t *spa;
-	zbookmark_t zb;
-	uint32_t aflags = ARC_NOWAIT;
+	zbookmark_phys_t zb;
+	arc_flags_t aflags = ARC_FLAG_NOWAIT;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
@@ -536,7 +945,6 @@
 		if (bonuslen)
 			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
 		DB_DNODE_EXIT(db);
-		dbuf_update_data(db);
 		db->db_state = DB_CACHED;
 		mutex_exit(&db->db_mtx);
 		return;
@@ -552,17 +960,36 @@
 	    BP_IS_HOLE(db->db_blkptr)))) {
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
-		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
+		dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa,
 		    db->db.db_size, db, type));
+		bzero(db->db.db_data, db->db.db_size);
+
+		if (db->db_blkptr != NULL && db->db_level > 0 &&
+		    BP_IS_HOLE(db->db_blkptr) &&
+		    db->db_blkptr->blk_birth != 0) {
+			blkptr_t *bps = db->db.db_data;
+			for (int i = 0; i < ((1 <<
+			    DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
+			    i++) {
+				blkptr_t *bp = &bps[i];
+				ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+				    1 << dn->dn_indblkshift);
+				BP_SET_LSIZE(bp,
+				    BP_GET_LEVEL(db->db_blkptr) == 1 ?
+				    dn->dn_datablksz :
+				    BP_GET_LSIZE(db->db_blkptr));
+				BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
+				BP_SET_LEVEL(bp,
+				    BP_GET_LEVEL(db->db_blkptr) - 1);
+				BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
+			}
+		}
 		DB_DNODE_EXIT(db);
-		bzero(db->db.db_data, db->db.db_size);
 		db->db_state = DB_CACHED;
-		*flags |= DB_RF_CACHED;
 		mutex_exit(&db->db_mtx);
 		return;
 	}
 
-	spa = dn->dn_objset->os_spa;
 	DB_DNODE_EXIT(db);
 
 	db->db_state = DB_READ;
@@ -569,9 +996,7 @@
 	mutex_exit(&db->db_mtx);
 
 	if (DBUF_IS_L2CACHEABLE(db))
-		aflags |= ARC_L2CACHE;
-	if (DBUF_IS_L2COMPRESSIBLE(db))
-		aflags |= ARC_L2COMPRESS;
+		aflags |= ARC_FLAG_L2CACHE;
 
 	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
 	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
@@ -579,12 +1004,10 @@
 
 	dbuf_add_ref(db, NULL);
 
-	(void) arc_read(zio, spa, db->db_blkptr,
+	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
-	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
 	    &aflags, &zb);
-	if (aflags & ARC_CACHED)
-		*flags |= DB_RF_CACHED;
 }
 
 int
@@ -591,8 +1014,8 @@
 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
 	int err = 0;
-	int havepzio = (zio != NULL);
-	int prefetch;
+	boolean_t havepzio = (zio != NULL);
+	boolean_t prefetch;
 	dnode_t *dn;
 
 	/*
@@ -617,8 +1040,7 @@
 	if (db->db_state == DB_CACHED) {
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
-			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
-			    db->db.db_size, TRUE);
+			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
@@ -627,13 +1049,12 @@
 
 		if (zio == NULL)
 			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-		dbuf_read_impl(db, zio, &flags);
+		dbuf_read_impl(db, zio, flags);
 
 		/* dbuf_read_impl has dropped db_mtx for us */
 
 		if (prefetch)
-			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
-			    db->db.db_size, flags & DB_RF_CACHED);
+			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
 
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
@@ -652,8 +1073,7 @@
 		 */
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
-			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
-			    db->db.db_size, TRUE);
+			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
@@ -665,6 +1085,8 @@
 			    db->db_state == DB_FILL) {
 				ASSERT(db->db_state == DB_READ ||
 				    (flags & DB_RF_HAVESTRUCT) == 0);
+				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
+				    db, zio_t *, zio);
 				cv_wait(&db->db_changed, &db->db_mtx);
 			}
 			if (db->db_state == DB_UNCACHED)
@@ -687,15 +1109,14 @@
 		cv_wait(&db->db_changed, &db->db_mtx);
 	if (db->db_state == DB_UNCACHED) {
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-		spa_t *spa;
+		spa_t *spa = db->db_objset->os_spa;
 
 		ASSERT(db->db_buf == NULL);
 		ASSERT(db->db.db_data == NULL);
-		DB_GET_SPA(&spa, db);
-		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
+		dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type));
 		db->db_state = DB_FILL;
 	} else if (db->db_state == DB_NOFILL) {
-		dbuf_set_data(db, NULL);
+		dbuf_clear_data(db);
 	} else {
 		ASSERT3U(db->db_state, ==, DB_CACHED);
 	}
@@ -746,13 +1167,13 @@
 	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
 		int size = db->db.db_size;
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-		spa_t *spa;
+		spa_t *spa = db->db_objset->os_spa;
 
-		DB_GET_SPA(&spa, db);
-		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
+		dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type);
 		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
 	} else {
-		dbuf_set_data(db, NULL);
+		db->db_buf = NULL;
+		dbuf_clear_data(db);
 	}
 }
 
@@ -774,12 +1195,9 @@
 	ASSERT(db->db_data_pending != dr);
 
 	/* free this block */
-	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
-		spa_t *spa;
+	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
+		zio_free(db->db_objset->os_spa, txg, bp);
 
-		DB_GET_SPA(&spa, db);
-		zio_free(spa, txg, bp);
-	}
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	dr->dt.dl.dr_nopwrite = B_FALSE;
 
@@ -797,49 +1215,63 @@
 /*
  * Evict (if its unreferenced) or clear (if its referenced) any level-0
  * data blocks in the free range, so that any future readers will find
- * empty blocks.  Also, if we happen accross any level-1 dbufs in the
- * range that have not already been marked dirty, mark them dirty so
- * they stay in memory.
+ * empty blocks.
+ *
+ * This is a no-op if the dataset is in the middle of an incremental
+ * receive; see comment below for details.
  */
 void
-dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
+dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+    dmu_tx_t *tx)
 {
+	dmu_buf_impl_t db_search;
 	dmu_buf_impl_t *db, *db_next;
 	uint64_t txg = tx->tx_txg;
-	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-	uint64_t first_l1 = start >> epbs;
-	uint64_t last_l1 = end >> epbs;
+	avl_index_t where;
 
-	if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
-		end = dn->dn_maxblkid;
-		last_l1 = end >> epbs;
+	if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
+		end_blkid = dn->dn_maxblkid;
+	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
+
+	db_search.db_level = 0;
+	db_search.db_blkid = start_blkid;
+	db_search.db_state = DB_SEARCH;
+
+	mutex_enter(&dn->dn_dbufs_mtx);
+	if (start_blkid >= dn->dn_unlisted_l0_blkid) {
+		/* There can't be any dbufs in this range; no need to search. */
+#ifdef DEBUG
+		db = avl_find(&dn->dn_dbufs, &db_search, &where);
+		ASSERT3P(db, ==, NULL);
+		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+		ASSERT(db == NULL || db->db_level > 0);
+#endif
+		mutex_exit(&dn->dn_dbufs_mtx);
+		return;
+	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
+		/*
+		 * If we are receiving, we expect there to be no dbufs in
+		 * the range to be freed, because receive modifies each
+		 * block at most once, and in offset order.  If this is
+		 * not the case, it can lead to performance problems,
+		 * so note that we unexpectedly took the slow path.
+		 */
+		atomic_inc_64(&zfs_free_range_recv_miss);
 	}
-	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
-	mutex_enter(&dn->dn_dbufs_mtx);
-	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
-		db_next = list_next(&dn->dn_dbufs, db);
+
+	db = avl_find(&dn->dn_dbufs, &db_search, &where);
+	ASSERT3P(db, ==, NULL);
+	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+	for (; db != NULL; db = db_next) {
+		db_next = AVL_NEXT(&dn->dn_dbufs, db);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
-		if (db->db_level == 1 &&
-		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
-			mutex_enter(&db->db_mtx);
-			if (db->db_last_dirty &&
-			    db->db_last_dirty->dr_txg < txg) {
-				dbuf_add_ref(db, FTAG);
-				mutex_exit(&db->db_mtx);
-				dbuf_will_dirty(db, tx);
-				dbuf_rele(db, FTAG);
-			} else {
-				mutex_exit(&db->db_mtx);
-			}
+		if (db->db_level != 0 || db->db_blkid > end_blkid) {
+			break;
 		}
+		ASSERT3U(db->db_blkid, >=, start_blkid);
 
-		if (db->db_level != 0)
-			continue;
-		dprintf_dbuf(db, "found buf %s\n", "");
-		if (db->db_blkid < start || db->db_blkid > end)
-			continue;
-
 		/* found a level 0 buffer in the range */
 		mutex_enter(&db->db_mtx);
 		if (dbuf_undirty(db, tx)) {
@@ -862,7 +1294,7 @@
 		}
 		if (refcount_count(&db->db_holds) == 0) {
 			ASSERT(db->db_buf);
-			dbuf_clear(db);
+			dbuf_destroy(db);
 			continue;
 		}
 		/* The dbuf is referenced */
@@ -913,24 +1345,29 @@
 	 * We don't need any locking to protect db_blkptr:
 	 * If it's syncing, then db_last_dirty will be set
 	 * so we'll ignore db_blkptr.
+	 *
+	 * This logic ensures that only block births for
+	 * filled blocks are considered.
 	 */
 	ASSERT(MUTEX_HELD(&db->db_mtx));
-	if (db->db_last_dirty)
+	if (db->db_last_dirty && (db->db_blkptr == NULL ||
+	    !BP_IS_HOLE(db->db_blkptr))) {
 		birth_txg = db->db_last_dirty->dr_txg;
-	else if (db->db_blkptr)
+	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
 		birth_txg = db->db_blkptr->blk_birth;
+	}
 
 	/*
-	 * If we don't exist or are in a snapshot, we can't be freed.
+	 * If this block doesn't exist or is in a snapshot, it can't be freed.
 	 * Don't pass the bp to dsl_dataset_block_freeable() since we
 	 * are holding the db_mtx lock and might deadlock if we are
 	 * prefetching a dedup-ed block.
 	 */
-	if (birth_txg)
+	if (birth_txg != 0)
 		return (ds == NULL ||
 		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
 	else
-		return (FALSE);
+		return (B_FALSE);
 }
 
 void
@@ -950,7 +1387,7 @@
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
 	/*
-	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
+	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
 	 * is OK, because there can be no other references to the db
 	 * when we are changing its size, so no concurrent DB_FILL can
 	 * be happening.
@@ -959,10 +1396,10 @@
 	 * XXX we should be doing a dbuf_read, checking the return
 	 * value and returning that up to our callers
 	 */
-	dbuf_will_dirty(db, tx);
+	dmu_buf_will_dirty(&db->db, tx);
 
 	/* create the data buffer for the new block */
-	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
+	buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type);
 
 	/* copy old block data to the new block */
 	obuf = db->db_buf;
@@ -973,7 +1410,7 @@
 
 	mutex_enter(&db->db_mtx);
 	dbuf_set_data(db, buf);
-	VERIFY(arc_buf_remove_ref(obuf, db));
+	arc_buf_destroy(obuf, db);
 	db->db.db_size = size;
 
 	if (db->db_level == 0) {
@@ -989,9 +1426,8 @@
 void
 dbuf_release_bp(dmu_buf_impl_t *db)
 {
-	objset_t *os;
+	objset_t *os = db->db_objset;
 
-	DB_GET_OBJSET(&os, db);
 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
 	ASSERT(arc_released(os->os_phys_buf) ||
 	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
@@ -1000,6 +1436,32 @@
 	(void) arc_release(db->db_buf, db);
 }
 
+/*
+ * We already have a dirty record for this TXG, and we are being
+ * dirtied again.
+ */
+static void
+dbuf_redirty(dbuf_dirty_record_t *dr)
+{
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+
+	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
+		/*
+		 * If this buffer has already been written out,
+		 * we now need to reset its state.
+		 */
+		dbuf_unoverride(dr);
+		if (db->db.db_object != DMU_META_DNODE_OBJECT &&
+		    db->db_state != DB_NOFILL) {
+			/* Already released on initial dirty, so just thaw. */
+			ASSERT(arc_released(db->db_buf));
+			arc_buf_thaw(db->db_buf);
+		}
+	}
+}
+
 dbuf_dirty_record_t *
 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
@@ -1021,10 +1483,18 @@
 	 * objects may be dirtied in syncing context, but only if they
 	 * were already pre-dirtied in open context.
 	 */
+#ifdef DEBUG
+	if (dn->dn_objset->os_dsl_dataset != NULL) {
+		rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
+		    RW_READER, FTAG);
+	}
 	ASSERT(!dmu_tx_is_syncing(tx) ||
 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
 	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    dn->dn_objset->os_dsl_dataset == NULL);
+	if (dn->dn_objset->os_dsl_dataset != NULL)
+		rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
+#endif
 	/*
 	 * We make this assert for private objects as well, but after we
 	 * check if we're already dirty.  They are allowed to re-dirty
@@ -1049,12 +1519,21 @@
 	 * Don't set dirtyctx to SYNC if we're just modifying this as we
 	 * initialize the objset.
 	 */
-	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
-	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
-		dn->dn_dirtyctx =
-		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
-		ASSERT(dn->dn_dirtyctx_firstset == NULL);
-		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+	if (dn->dn_dirtyctx == DN_UNDIRTIED) {
+		if (dn->dn_objset->os_dsl_dataset != NULL) {
+			rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
+			    RW_READER, FTAG);
+		}
+		if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
+			dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ?
+			    DN_DIRTY_SYNC : DN_DIRTY_OPEN);
+			ASSERT(dn->dn_dirtyctx_firstset == NULL);
+			dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+		}
+		if (dn->dn_objset->os_dsl_dataset != NULL) {
+			rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
+			    FTAG);
+		}
 	}
 	mutex_exit(&dn->dn_mtx);
 
@@ -1072,16 +1551,7 @@
 	if (dr && dr->dr_txg == tx->tx_txg) {
 		DB_DNODE_EXIT(db);
 
-		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
-			/*
-			 * If this buffer has already been written out,
-			 * we now need to reset its state.
-			 */
-			dbuf_unoverride(dr);
-			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
-			    db->db_state != DB_NOFILL)
-				arc_buf_thaw(db->db_buf);
-		}
+		dbuf_redirty(dr);
 		mutex_exit(&db->db_mtx);
 		return (dr);
 	}
@@ -1108,8 +1578,14 @@
 	 * this assertion only if we're not already dirty.
 	 */
 	os = dn->dn_objset;
+#ifdef DEBUG
+	if (dn->dn_objset->os_dsl_dataset != NULL)
+		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
 	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
+	if (dn->dn_objset->os_dsl_dataset != NULL)
+		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
+#endif
 	ASSERT(db->db.db_size != 0);
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
@@ -1162,6 +1638,8 @@
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
+	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
+		dr->dr_accounted = db->db.db_size;
 	dr->dr_dbuf = db;
 	dr->dr_txg = tx->tx_txg;
 	dr->dr_next = *drp;
@@ -1175,7 +1653,10 @@
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_blkid != DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
-		dnode_clear_range(dn, db->db_blkid, 1, tx);
+		if (dn->dn_free_ranges[txgoff] != NULL) {
+			range_tree_clear(dn->dn_free_ranges[txgoff],
+			    db->db_blkid, 1);
+		}
 		mutex_exit(&dn->dn_mtx);
 		db->db_freed_in_flight = FALSE;
 	}
@@ -1198,7 +1679,20 @@
 		dnode_setdirty(dn, tx);
 		DB_DNODE_EXIT(db);
 		return (dr);
-	} else if (do_free_accounting) {
+	}
+
+	/*
+	 * The dn_struct_rwlock prevents db_blkptr from changing
+	 * due to a write from syncing context completing
+	 * while we are running, so we want to acquire it before
+	 * looking at db_blkptr.
+	 */
+	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		drop_struct_lock = TRUE;
+	}
+
+	if (do_free_accounting) {
 		blkptr_t *bp = db->db_blkptr;
 		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
 		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
@@ -1214,11 +1708,6 @@
 		dnode_willuse_space(dn, -willfree, tx);
 	}
 
-	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
-		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		drop_struct_lock = TRUE;
-	}
-
 	if (db->db_level == 0) {
 		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
@@ -1245,7 +1734,10 @@
 			dbuf_rele(parent, FTAG);
 
 		mutex_enter(&db->db_mtx);
-		/*  possible race with dbuf_undirty() */
+		/*
+		 * Since we've dropped the mutex, it's possible that
+		 * dbuf_undirty() might have changed this out from under us.
+		 */
 		if (db->db_last_dirty == dr ||
 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
 			mutex_enter(&di->dt.di.dr_mtx);
@@ -1285,6 +1777,16 @@
 	dbuf_dirty_record_t *dr, **drp;
 
 	ASSERT(txg != 0);
+
+	/*
+	 * Due to our use of dn_nlevels below, this can only be called
+	 * in open context, unless we are operating on the MOS.
+	 * From syncing context, dn_nlevels may be different from the
+	 * dn_nlevels used when dbuf was dirtied.
+	 */
+	ASSERT(db->db_objset ==
+	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
+	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT0(db->db_level);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1303,19 +1805,12 @@
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
-	/*
-	 * Note:  This code will probably work even if there are concurrent
-	 * holders, but it is untested in that scenerio, as the ZPL and
-	 * ztest have additional locking (the range locks) that prevents
-	 * that type of concurrent access.
-	 */
-	ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
-
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
 	ASSERT(db->db.db_size != 0);
 
-	/* XXX would be nice to fix up dn_towrite_space[] */
+	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
+	    dr->dr_accounted, txg);
 
 	*drp = dr->dr_next;
 
@@ -1330,7 +1825,7 @@
 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
 	} else if (db->db_blkid == DMU_SPILL_BLKID ||
-	    db->db_level+1 == dn->dn_nlevels) {
+	    db->db_level + 1 == dn->dn_nlevels) {
 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
@@ -1344,8 +1839,9 @@
 		ASSERT(db->db_buf != NULL);
 		ASSERT(dr->dt.dl.dr_data != NULL);
 		if (dr->dt.dl.dr_data != db->db_buf)
-			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
+			arc_buf_destroy(dr->dt.dl.dr_data, db);
 	}
+
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
 	ASSERT(db->db_dirtycnt > 0);
@@ -1352,12 +1848,8 @@
 	db->db_dirtycnt -= 1;
 
 	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
-		arc_buf_t *buf = db->db_buf;
-
-		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
-		dbuf_set_data(db, NULL);
-		VERIFY(arc_buf_remove_ref(buf, db));
-		dbuf_evict(db);
+		ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+		dbuf_destroy(db);
 		return (B_TRUE);
 	}
 
@@ -1364,15 +1856,39 @@
 	return (B_FALSE);
 }
 
-#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
 void
-dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
 
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
+	/*
+	 * Quick check for dirtiness.  For already dirty blocks, this
+	 * reduces runtime of this function by >90%, and overall performance
+	 * by 50% for some workloads (e.g. file deletion with indirect blocks
+	 * cached).
+	 */
+	mutex_enter(&db->db_mtx);
+	dbuf_dirty_record_t *dr;
+	for (dr = db->db_last_dirty;
+	    dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
+		/*
+		 * It's possible that it is already dirty but not cached,
+		 * because there are some calls to dbuf_dirty() that don't
+		 * go through dmu_buf_will_dirty().
+		 */
+		if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
+			/* This dbuf is already dirty and cached. */
+			dbuf_redirty(dr);
+			mutex_exit(&db->db_mtx);
+			return;
+		}
+	}
+	mutex_exit(&db->db_mtx);
+
 	DB_DNODE_ENTER(db);
 	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
 		rf |= DB_RF_HAVESTRUCT;
@@ -1430,6 +1946,43 @@
 	mutex_exit(&db->db_mtx);
 }
 
+void
+dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+    bp_embedded_type_t etype, enum zio_compress comp,
+    int uncompressed_size, int compressed_size, int byteorder,
+    dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+	struct dirty_leaf *dl;
+	dmu_object_type_t type;
+
+	if (etype == BP_EMBEDDED_TYPE_DATA) {
+		ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
+		    SPA_FEATURE_EMBEDDED_DATA));
+	}
+
+	DB_DNODE_ENTER(db);
+	type = DB_DNODE(db)->dn_type;
+	DB_DNODE_EXIT(db);
+
+	ASSERT0(db->db_level);
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+	dmu_buf_will_not_fill(dbuf, tx);
+
+	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+	dl = &db->db_last_dirty->dt.dl;
+	encode_embedded_bp_compressed(&dl->dr_overridden_by,
+	    data, comp, uncompressed_size, compressed_size);
+	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
+	BP_SET_TYPE(&dl->dr_overridden_by, type);
+	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
+	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
+
+	dl->dr_override_state = DR_OVERRIDDEN;
+	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
+}
+
 /*
  * Directly assign a provided arc buf to a given dbuf if it's not referenced
  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
@@ -1460,7 +2013,7 @@
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_dirty(db, tx);
 		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
-		VERIFY(arc_buf_remove_ref(buf, db));
+		arc_buf_destroy(buf, db);
 		xuio_stat_wbuf_copied();
 		return;
 	}
@@ -1478,10 +2031,10 @@
 				arc_release(db->db_buf, db);
 			}
 			dr->dt.dl.dr_data = buf;
-			VERIFY(arc_buf_remove_ref(db->db_buf, db));
+			arc_buf_destroy(db->db_buf, db);
 		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
 			arc_release(db->db_buf, db);
-			VERIFY(arc_buf_remove_ref(db->db_buf, db));
+			arc_buf_destroy(db->db_buf, db);
 		}
 		db->db_buf = NULL;
 	}
@@ -1490,45 +2043,39 @@
 	db->db_state = DB_FILL;
 	mutex_exit(&db->db_mtx);
 	(void) dbuf_dirty(db, tx);
-	dbuf_fill_done(db, tx);
+	dmu_buf_fill_done(&db->db, tx);
 }
 
-/*
- * "Clear" the contents of this dbuf.  This will mark the dbuf
- * EVICTING and clear *most* of its references.  Unfortunetely,
- * when we are not holding the dn_dbufs_mtx, we can't clear the
- * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
- * in this case.  For callers from the DMU we will usually see:
- *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
- * For the arc callback, we will usually see:
- *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
- * Sometimes, though, we will get a mix of these two:
- *	DMU: dbuf_clear()->arc_buf_evict()
- *	ARC: dbuf_do_evict()->dbuf_destroy()
- */
 void
-dbuf_clear(dmu_buf_impl_t *db)
+dbuf_destroy(dmu_buf_impl_t *db)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *parent = db->db_parent;
 	dmu_buf_impl_t *dndb;
-	int dbuf_gone = FALSE;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(refcount_is_zero(&db->db_holds));
 
-	dbuf_evict_user(db);
+	if (db->db_buf != NULL) {
+		arc_buf_destroy(db->db_buf, db);
+		db->db_buf = NULL;
+	}
 
-	if (db->db_state == DB_CACHED) {
+	if (db->db_blkid == DMU_BONUS_BLKID) {
 		ASSERT(db->db.db_data != NULL);
-		if (db->db_blkid == DMU_BONUS_BLKID) {
-			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
-			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
-		}
-		db->db.db_data = NULL;
+		zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
+		arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		db->db_state = DB_UNCACHED;
 	}
 
+	dbuf_clear_data(db);
+
+	if (multilist_link_active(&db->db_cache_link)) {
+		multilist_remove(&dbuf_cache, db);
+		(void) refcount_remove_many(&dbuf_cache_size,
+		    db->db.db_size, db);
+	}
+
 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
 	ASSERT(db->db_data_pending == NULL);
 
@@ -1535,14 +2082,26 @@
 	db->db_state = DB_EVICTING;
 	db->db_blkptr = NULL;
 
+	/*
+	 * Now that db_state is DB_EVICTING, nobody else can find this via
+	 * the hash table.  We can now drop db_mtx, which allows us to
+	 * acquire the dn_dbufs_mtx.
+	 */
+	mutex_exit(&db->db_mtx);
+
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	dndb = dn->dn_dbuf;
-	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
-		list_remove(&dn->dn_dbufs, db);
-		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
+	if (db->db_blkid != DMU_BONUS_BLKID) {
+		boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
+		if (needlock)
+			mutex_enter(&dn->dn_dbufs_mtx);
+		avl_remove(&dn->dn_dbufs, db);
+		atomic_dec_32(&dn->dn_dbufs_count);
 		membar_producer();
 		DB_DNODE_EXIT(db);
+		if (needlock)
+			mutex_exit(&dn->dn_dbufs_mtx);
 		/*
 		 * Decrementing the dbuf count means that the hold corresponding
 		 * to the removed dbuf is no longer discounted in dnode_move(),
@@ -1553,16 +2112,26 @@
 		 */
 		dnode_rele(dn, db);
 		db->db_dnode_handle = NULL;
+
+		dbuf_hash_remove(db);
 	} else {
 		DB_DNODE_EXIT(db);
 	}
 
-	if (db->db_buf)
-		dbuf_gone = arc_buf_evict(db->db_buf);
+	ASSERT(refcount_is_zero(&db->db_holds));
 
-	if (!dbuf_gone)
-		mutex_exit(&db->db_mtx);
+	db->db_parent = NULL;
 
+	ASSERT(db->db_buf == NULL);
+	ASSERT(db->db.db_data == NULL);
+	ASSERT(db->db_hash_next == NULL);
+	ASSERT(db->db_blkptr == NULL);
+	ASSERT(db->db_data_pending == NULL);
+	ASSERT(!multilist_link_active(&db->db_cache_link));
+
+	kmem_cache_free(dbuf_kmem_cache, db);
+	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
+
 	/*
 	 * If this dbuf is referenced from an indirect dbuf,
 	 * decrement the ref count on the indirect dbuf.
@@ -1571,6 +2140,12 @@
 		dbuf_rele(parent, db);
 }
 
+/*
+ * Note: While bpp will always be updated if the function returns success,
+ * parentp will not be updated if the dnode does not have dn_dbuf filled in;
+ * this happens when the dnode is the meta-dnode, or a userused or groupused
+ * object.
+ */
 static int
 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
     dmu_buf_impl_t **parentp, blkptr_t **bpp)
@@ -1611,7 +2186,7 @@
 	} else if (level < nlevels-1) {
 		/* this block is referenced from an indirect block */
 		int err = dbuf_hold_impl(dn, level+1,
-		    blkid >> epbs, fail_sparse, NULL, parentp);
+		    blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
 		if (err)
 			return (err);
 		err = dbuf_read(*parentp, NULL,
@@ -1648,7 +2223,7 @@
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT(dn->dn_type != DMU_OT_NONE);
 
-	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
+	db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
 
 	db->db_objset = os;
 	db->db.db_object = dn->dn_object;
@@ -1660,11 +2235,10 @@
 	db->db_parent = parent;
 	db->db_blkptr = blkptr;
 
-	db->db_user_ptr = NULL;
-	db->db_user_data_ptr_ptr = NULL;
-	db->db_evict_func = NULL;
-	db->db_immediate_evict = 0;
-	db->db_freed_in_flight = 0;
+	db->db_user = NULL;
+	db->db_user_immediate_evict = FALSE;
+	db->db_freed_in_flight = FALSE;
+	db->db_pending_evict = FALSE;
 
 	if (blkid == DMU_BONUS_BLKID) {
 		ASSERT3P(parent, ==, dn->dn_dbuf);
@@ -1682,7 +2256,7 @@
 		db->db.db_offset = 0;
 	} else {
 		int blocksize =
-		    db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
+		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
 		db->db.db_size = blocksize;
 		db->db.db_offset = db->db_blkid * blocksize;
 	}
@@ -1698,11 +2272,14 @@
 	db->db_state = DB_EVICTING;
 	if ((odb = dbuf_hash_insert(db)) != NULL) {
 		/* someone else inserted it first */
-		kmem_cache_free(dbuf_cache, db);
+		kmem_cache_free(dbuf_kmem_cache, db);
 		mutex_exit(&dn->dn_dbufs_mtx);
 		return (odb);
 	}
-	list_insert_head(&dn->dn_dbufs, db);
+	avl_add(&dn->dn_dbufs, db);
+	if (db->db_level == 0 && db->db_blkid >=
+	    dn->dn_unlisted_l0_blkid)
+		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
 	db->db_state = DB_UNCACHED;
 	mutex_exit(&dn->dn_dbufs_mtx);
 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
@@ -1713,7 +2290,7 @@
 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    refcount_count(&dn->dn_holds) > 0);
 	(void) refcount_add(&dn->dn_holds, db);
-	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
+	atomic_inc_32(&dn->dn_dbufs_count);
 
 	dprintf_dbuf(db, "db=%p\n", db);
 
@@ -1720,115 +2297,246 @@
 	return (db);
 }
 
-static int
-dbuf_do_evict(void *private)
+typedef struct dbuf_prefetch_arg {
+	spa_t *dpa_spa;	/* The spa to issue the prefetch in. */
+	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
+	int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
+	int dpa_curlevel; /* The current level that we're reading */
+	dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
+	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
+	zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
+	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
+} dbuf_prefetch_arg_t;
+
+/*
+ * Actually issue the prefetch read for the block given.
+ */
+static void
+dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
 {
-	arc_buf_t *buf = private;
-	dmu_buf_impl_t *db = buf->b_private;
+	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+		return;
 
-	if (!MUTEX_HELD(&db->db_mtx))
-		mutex_enter(&db->db_mtx);
+	arc_flags_t aflags =
+	    dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 
-	ASSERT(refcount_is_zero(&db->db_holds));
-
-	if (db->db_state != DB_EVICTING) {
-		ASSERT(db->db_state == DB_CACHED);
-		DBUF_VERIFY(db);
-		db->db_buf = NULL;
-		dbuf_evict(db);
-	} else {
-		mutex_exit(&db->db_mtx);
-		dbuf_destroy(db);
-	}
-	return (0);
+	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
+	ASSERT(dpa->dpa_zio != NULL);
+	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
+	    dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+	    &aflags, &dpa->dpa_zb);
 }
 
+/*
+ * Called when an indirect block above our prefetch target is read in.  This
+ * will either read in the next indirect block down the tree or issue the actual
+ * prefetch if the next block down is our target.
+ */
 static void
-dbuf_destroy(dmu_buf_impl_t *db)
+dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
 {
-	ASSERT(refcount_is_zero(&db->db_holds));
+	dbuf_prefetch_arg_t *dpa = private;
 
-	if (db->db_blkid != DMU_BONUS_BLKID) {
-		/*
-		 * If this dbuf is still on the dn_dbufs list,
-		 * remove it from that list.
-		 */
-		if (db->db_dnode_handle != NULL) {
-			dnode_t *dn;
+	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
+	ASSERT3S(dpa->dpa_curlevel, >, 0);
 
-			DB_DNODE_ENTER(db);
-			dn = DB_DNODE(db);
-			mutex_enter(&dn->dn_dbufs_mtx);
-			list_remove(&dn->dn_dbufs, db);
-			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
-			mutex_exit(&dn->dn_dbufs_mtx);
-			DB_DNODE_EXIT(db);
-			/*
-			 * Decrementing the dbuf count means that the hold
-			 * corresponding to the removed dbuf is no longer
-			 * discounted in dnode_move(), so the dnode cannot be
-			 * moved until after we release the hold.
-			 */
-			dnode_rele(dn, db);
-			db->db_dnode_handle = NULL;
+	/*
+	 * The dpa_dnode is only valid if we are called with a NULL
+	 * zio. This indicates that the arc_read() returned without
+	 * first calling zio_read() to issue a physical read. Once
+	 * a physical read is made the dpa_dnode must be invalidated
+	 * as the locks guarding it may have been dropped. If the
+	 * dpa_dnode is still valid, then we want to add it to the dbuf
+	 * cache. To do so, we must hold the dbuf associated with the block
+	 * we just prefetched, read its contents so that we associate it
+	 * with an arc_buf_t, and then release it.
+	 */
+	if (zio != NULL) {
+		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
+		if (zio->io_flags & ZIO_FLAG_RAW) {
+			ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
+		} else {
+			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
 		}
-		dbuf_hash_remove(db);
+		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
+
+		dpa->dpa_dnode = NULL;
+	} else if (dpa->dpa_dnode != NULL) {
+		uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
+		    (dpa->dpa_epbs * (dpa->dpa_curlevel -
+		    dpa->dpa_zb.zb_level));
+		dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
+		    dpa->dpa_curlevel, curblkid, FTAG);
+		(void) dbuf_read(db, NULL,
+		    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
+		dbuf_rele(db, FTAG);
 	}
-	db->db_parent = NULL;
-	db->db_buf = NULL;
 
-	ASSERT(!list_link_active(&db->db_link));
-	ASSERT(db->db.db_data == NULL);
-	ASSERT(db->db_hash_next == NULL);
-	ASSERT(db->db_blkptr == NULL);
-	ASSERT(db->db_data_pending == NULL);
+	dpa->dpa_curlevel--;
 
-	kmem_cache_free(dbuf_cache, db);
-	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
+	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
+	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
+	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
+	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
+	if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
+		kmem_free(dpa, sizeof (*dpa));
+	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
+		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
+		dbuf_issue_final_prefetch(dpa, bp);
+		kmem_free(dpa, sizeof (*dpa));
+	} else {
+		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+		zbookmark_phys_t zb;
+
+		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
+		if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
+			iter_aflags |= ARC_FLAG_L2CACHE;
+
+		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+
+		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
+		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
+
+		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+		    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+		    &iter_aflags, &zb);
+	}
+
+	arc_buf_destroy(abuf, private);
 }
 
+/*
+ * Issue prefetch reads for the given block on the given level.  If the indirect
+ * blocks above that block are not in memory, we will read them in
+ * asynchronously.  As a result, this call never blocks waiting for a read to
+ * complete.
+ */
 void
-dbuf_prefetch(dnode_t *dn, uint64_t blkid)
+dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
+    arc_flags_t aflags)
 {
-	dmu_buf_impl_t *db = NULL;
-	blkptr_t *bp = NULL;
+	blkptr_t bp;
+	int epbs, nlevels, curlevel;
+	uint64_t curblkid;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
+	if (blkid > dn->dn_maxblkid)
+		return;
+
 	if (dnode_block_freed(dn, blkid))
 		return;
 
-	/* dbuf_find() returns with db_mtx held */
-	if (db = dbuf_find(dn, 0, blkid)) {
+	/*
+	 * This dnode hasn't been written to disk yet, so there's nothing to
+	 * prefetch.
+	 */
+	nlevels = dn->dn_phys->dn_nlevels;
+	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
+		return;
+
+	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
+		return;
+
+	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
+	    level, blkid);
+	if (db != NULL) {
+		mutex_exit(&db->db_mtx);
 		/*
-		 * This dbuf is already in the cache.  We assume that
-		 * it is already CACHED, or else about to be either
-		 * read or filled.
+		 * This dbuf already exists.  It is either CACHED, or
+		 * (we assume) about to be read or filled.
 		 */
-		mutex_exit(&db->db_mtx);
 		return;
 	}
 
-	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
-		if (bp && !BP_IS_HOLE(bp)) {
-			int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
-			    ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
-			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
-			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
-			zbookmark_t zb;
+	/*
+	 * Find the closest ancestor (indirect block) of the target block
+	 * that is present in the cache.  In this indirect block, we will
+	 * find the bp that is at curlevel, curblkid.
+	 */
+	curlevel = level;
+	curblkid = blkid;
+	while (curlevel < nlevels - 1) {
+		int parent_level = curlevel + 1;
+		uint64_t parent_blkid = curblkid >> epbs;
+		dmu_buf_impl_t *db;
 
-			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
-			    dn->dn_object, 0, blkid);
+		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
+		    FALSE, TRUE, FTAG, &db) == 0) {
+			blkptr_t *bpp = db->db_buf->b_data;
+			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
+			dbuf_rele(db, FTAG);
+			break;
+		}
 
-			(void) arc_read(NULL, dn->dn_objset->os_spa,
-			    bp, NULL, NULL, priority,
-			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
-			    &aflags, &zb);
-		}
-		if (db)
-			dbuf_rele(db, NULL);
+		curlevel = parent_level;
+		curblkid = parent_blkid;
 	}
+
+	if (curlevel == nlevels - 1) {
+		/* No cached indirect blocks found. */
+		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
+		bp = dn->dn_phys->dn_blkptr[curblkid];
+	}
+	if (BP_IS_HOLE(&bp))
+		return;
+
+	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
+
+	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
+	    ZIO_FLAG_CANFAIL);
+
+	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
+	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+	    dn->dn_object, level, blkid);
+	dpa->dpa_curlevel = curlevel;
+	dpa->dpa_prio = prio;
+	dpa->dpa_aflags = aflags;
+	dpa->dpa_spa = dn->dn_objset->os_spa;
+	dpa->dpa_dnode = dn;
+	dpa->dpa_epbs = epbs;
+	dpa->dpa_zio = pio;
+
+	/* flag if L2ARC eligible, l2arc_noprefetch then decides */
+	if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
+		dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
+
+	/*
+	 * If we have the indirect just above us, no need to do the asynchronous
+	 * prefetch chain; we'll just run the last step ourselves.  If we're at
+	 * a higher level, though, we want to issue the prefetches for all the
+	 * indirect blocks asynchronously, so we can go on with whatever we were
+	 * doing.
+	 */
+	if (curlevel == level) {
+		ASSERT3U(curblkid, ==, blkid);
+		dbuf_issue_final_prefetch(dpa, &bp);
+		kmem_free(dpa, sizeof (*dpa));
+	} else {
+		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+		zbookmark_phys_t zb;
+
+		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
+		if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
+			iter_aflags |= ARC_FLAG_L2CACHE;
+
+		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+		    dn->dn_object, curlevel, curblkid);
+		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+		    &bp, dbuf_prefetch_indirect_done, dpa, prio,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+		    &iter_aflags, &zb);
+	}
+	/*
+	 * We use pio here instead of dpa_zio since it's possible that
+	 * dpa may have already been freed.
+	 */
+	zio_nowait(pio);
 }
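
The ancestor walk in dbuf_prefetch() simply shifts the blkid right by epbs
for each level climbed. A minimal sketch with hypothetical numbers (epbs =
10, i.e. 1024 block pointers per indirect block, and a made-up target blkid):

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
    	int epbs = 10, nlevels = 4;	/* hypothetical dnode geometry */
    	uint64_t blkid = 1234567;	/* hypothetical level-0 target */

    	/* Each level up divides the blkid by 2^epbs, mirroring
    	 * parent_blkid = curblkid >> epbs in the loop above. */
    	for (int level = 0; level < nlevels; level++, blkid >>= epbs)
    		printf("level %d ancestor blkid = %llu\n", level,
    		    (unsigned long long)blkid);
    	/* prints 1234567, 1205, 1, 0 */
    	return (0);
    }
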
 
 /*
@@ -1836,7 +2544,8 @@
  * Note: dn_struct_rwlock must be held.
  */
 int
-dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
+    boolean_t fail_sparse, boolean_t fail_uncached,
     void *tag, dmu_buf_impl_t **dbp)
 {
 	dmu_buf_impl_t *db, *parent = NULL;
@@ -1848,12 +2557,15 @@
 	*dbp = NULL;
 top:
 	/* dbuf_find() returns with db_mtx held */
-	db = dbuf_find(dn, level, blkid);
+	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
 
 	if (db == NULL) {
 		blkptr_t *bp = NULL;
 		int err;
 
+		if (fail_uncached)
+			return (SET_ERROR(ENOENT));
+
 		ASSERT3P(parent, ==, NULL);
 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
 		if (fail_sparse) {
@@ -1870,19 +2582,14 @@
 		db = dbuf_create(dn, level, blkid, parent, bp);
 	}
 
-	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
-		arc_buf_add_ref(db->db_buf, db);
-		if (db->db_buf->b_data == NULL) {
-			dbuf_clear(db);
-			if (parent) {
-				dbuf_rele(parent, NULL);
-				parent = NULL;
-			}
-			goto top;
-		}
-		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+	if (fail_uncached && db->db_state != DB_CACHED) {
+		mutex_exit(&db->db_mtx);
+		return (SET_ERROR(ENOENT));
 	}
 
+	if (db->db_buf != NULL)
+		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+
 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
 
 	/*
@@ -1899,7 +2606,7 @@
 			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
 			dbuf_set_data(db,
-			    arc_buf_alloc(dn->dn_objset->os_spa,
+			    arc_alloc_buf(dn->dn_objset->os_spa,
 			    db->db.db_size, db, type));
 			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
 			    db->db.db_size);
@@ -1906,8 +2613,13 @@
 		}
 	}
 
+	if (multilist_link_active(&db->db_cache_link)) {
+		ASSERT(refcount_is_zero(&db->db_holds));
+		multilist_remove(&dbuf_cache, db);
+		(void) refcount_remove_many(&dbuf_cache_size,
+		    db->db.db_size, db);
+	}
 	(void) refcount_add(&db->db_holds, tag);
-	dbuf_update_data(db);
 	DBUF_VERIFY(db);
 	mutex_exit(&db->db_mtx);
 
@@ -1926,9 +2638,7 @@
 dmu_buf_impl_t *
 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
 {
-	dmu_buf_impl_t *db;
-	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
-	return (err ? NULL : db);
+	return (dbuf_hold_level(dn, 0, blkid, tag));
 }
 
 dmu_buf_impl_t *
@@ -1935,7 +2645,7 @@
 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
 {
 	dmu_buf_impl_t *db;
-	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+	int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
 	return (err ? NULL : db);
 }
 
@@ -1958,10 +2668,8 @@
 		return (SET_ERROR(ENOTSUP));
 	if (blksz == 0)
 		blksz = SPA_MINBLOCKSIZE;
-	if (blksz > SPA_MAXBLOCKSIZE)
-		blksz = SPA_MAXBLOCKSIZE;
-	else
-		blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
+	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
@@ -1984,9 +2692,33 @@
 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
 {
 	int64_t holds = refcount_add(&db->db_holds, tag);
-	ASSERT(holds > 1);
+	ASSERT3S(holds, >, 1);
 }
 
+#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
+boolean_t
+dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
+    void *tag)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dmu_buf_impl_t *found_db;
+	boolean_t result = B_FALSE;
+
+	if (db->db_blkid == DMU_BONUS_BLKID)
+		found_db = dbuf_find_bonus(os, obj);
+	else
+		found_db = dbuf_find(os, obj, 0, blkid);
+
+	if (found_db != NULL) {
+		if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
+			(void) refcount_add(&db->db_holds, tag);
+			result = B_TRUE;
+		}
+		mutex_exit(&db->db_mtx);
+	}
+	return (result);
+}
+
 /*
  * If you call dbuf_rele() you had better not be referencing the dnode handle
  * unless you have some other direct or indirect hold on the dnode. (An indirect
@@ -1994,7 +2726,6 @@
  * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
  * dnode's parent dbuf evicting its dnode handles.
  */
-#pragma weak dmu_buf_rele = dbuf_rele
 void
 dbuf_rele(dmu_buf_impl_t *db, void *tag)
 {
@@ -2002,6 +2733,12 @@
 	dbuf_rele_and_unlock(db, tag);
 }
 
+void
+dmu_buf_rele(dmu_buf_t *db, void *tag)
+{
+	dbuf_rele((dmu_buf_impl_t *)db, tag);
+}
+
 /*
  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
  * db_dirtycnt and db_holds to be updated atomically.
@@ -2026,30 +2763,47 @@
 	 * We can't freeze indirects if there is a possibility that they
 	 * may be modified in the current syncing context.
 	 */
-	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
+	if (db->db_buf != NULL &&
+	    holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
 		arc_buf_freeze(db->db_buf);
+	}
 
 	if (holds == db->db_dirtycnt &&
-	    db->db_level == 0 && db->db_immediate_evict)
+	    db->db_level == 0 && db->db_user_immediate_evict)
 		dbuf_evict_user(db);
 
 	if (holds == 0) {
 		if (db->db_blkid == DMU_BONUS_BLKID) {
-			mutex_exit(&db->db_mtx);
+			dnode_t *dn;
+			boolean_t evict_dbuf = db->db_pending_evict;
 
 			/*
-			 * If the dnode moves here, we cannot cross this barrier
-			 * until the move completes.
+			 * If the dnode moves here, we cannot cross this
+			 * barrier until the move completes.
 			 */
 			DB_DNODE_ENTER(db);
-			(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
+
+			dn = DB_DNODE(db);
+			atomic_dec_32(&dn->dn_dbufs_count);
+
+			/*
+			 * Decrementing the dbuf count means that the bonus
+			 * buffer's dnode hold is no longer discounted in
+			 * dnode_move(). The dnode cannot move until after
+			 * the dnode_rele() below.
+			 */
 			DB_DNODE_EXIT(db);
+
 			/*
-			 * The bonus buffer's dnode hold is no longer discounted
-			 * in dnode_move(). The dnode cannot move until after
-			 * the dnode_rele().
+			 * Do not reference db after its lock is dropped.
+			 * Another thread may evict it.
 			 */
-			dnode_rele(DB_DNODE(db), db);
+			mutex_exit(&db->db_mtx);
+
+			if (evict_dbuf)
+				dnode_evict_bonus(dn);
+
+			dnode_rele(dn, db);
 		} else if (db->db_buf == NULL) {
 			/*
 			 * This is a special case: we never associated this
@@ -2057,42 +2811,44 @@
 			 */
 			ASSERT(db->db_state == DB_UNCACHED ||
 			    db->db_state == DB_NOFILL);
-			dbuf_evict(db);
+			dbuf_destroy(db);
 		} else if (arc_released(db->db_buf)) {
-			arc_buf_t *buf = db->db_buf;
 			/*
 			 * This dbuf has anonymous data associated with it.
 			 */
-			dbuf_set_data(db, NULL);
-			VERIFY(arc_buf_remove_ref(buf, db));
-			dbuf_evict(db);
+			dbuf_destroy(db);
 		} else {
-			VERIFY(!arc_buf_remove_ref(db->db_buf, db));
+			boolean_t do_arc_evict = B_FALSE;
+			blkptr_t bp;
+			spa_t *spa = dmu_objset_spa(db->db_objset);
 
-			/*
-			 * A dbuf will be eligible for eviction if either the
-			 * 'primarycache' property is set or a duplicate
-			 * copy of this buffer is already cached in the arc.
-			 *
-			 * In the case of the 'primarycache' a buffer
-			 * is considered for eviction if it matches the
-			 * criteria set in the property.
-			 *
-			 * To decide if our buffer is considered a
-			 * duplicate, we must call into the arc to determine
-			 * if multiple buffers are referencing the same
-			 * block on-disk. If so, then we simply evict
-			 * ourselves.
-			 */
+			if (!DBUF_IS_CACHEABLE(db) &&
+			    db->db_blkptr != NULL &&
+			    !BP_IS_HOLE(db->db_blkptr) &&
+			    !BP_IS_EMBEDDED(db->db_blkptr)) {
+				do_arc_evict = B_TRUE;
+				bp = *db->db_blkptr;
+			}
+
 			if (!DBUF_IS_CACHEABLE(db) ||
-			    arc_buf_eviction_needed(db->db_buf))
-				dbuf_clear(db);
-			else
+			    db->db_pending_evict) {
+				dbuf_destroy(db);
+			} else if (!multilist_link_active(&db->db_cache_link)) {
+				multilist_insert(&dbuf_cache, db);
+				(void) refcount_add_many(&dbuf_cache_size,
+				    db->db.db_size, db);
 				mutex_exit(&db->db_mtx);
+
+				dbuf_evict_notify();
+			}
+
+			if (do_arc_evict)
+				arc_freed(spa, &bp);
 		}
 	} else {
 		mutex_exit(&db->db_mtx);
 	}
+
 }
 
 #pragma weak dmu_buf_refcount = dbuf_refcount
@@ -2103,47 +2859,42 @@
 }
 
 void *
-dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
-    dmu_buf_evict_func_t *evict_func)
+dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
+    dmu_buf_user_t *new_user)
 {
-	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
-	    user_data_ptr_ptr, evict_func));
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	mutex_enter(&db->db_mtx);
+	dbuf_verify_user(db, DBVU_NOT_EVICTING);
+	if (db->db_user == old_user)
+		db->db_user = new_user;
+	else
+		old_user = db->db_user;
+	dbuf_verify_user(db, DBVU_NOT_EVICTING);
+	mutex_exit(&db->db_mtx);
+
+	return (old_user);
 }
 
 void *
-dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
-    dmu_buf_evict_func_t *evict_func)
+dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
-	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
-	db->db_immediate_evict = TRUE;
-	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
-	    user_data_ptr_ptr, evict_func));
+	return (dmu_buf_replace_user(db_fake, NULL, user));
 }
 
 void *
-dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
-    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-	ASSERT(db->db_level == 0);
 
-	ASSERT((user_ptr == NULL) == (evict_func == NULL));
+	db->db_user_immediate_evict = TRUE;
+	return (dmu_buf_set_user(db_fake, user));
+}
 
-	mutex_enter(&db->db_mtx);
-
-	if (db->db_user_ptr == old_user_ptr) {
-		db->db_user_ptr = user_ptr;
-		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
-		db->db_evict_func = evict_func;
-
-		dbuf_update_data(db);
-	} else {
-		old_user_ptr = db->db_user_ptr;
-	}
-
-	mutex_exit(&db->db_mtx);
-	return (old_user_ptr);
+void *
+dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+	return (dmu_buf_replace_user(db_fake, user, NULL));
 }
 
 void *
@@ -2150,11 +2901,17 @@
 dmu_buf_get_user(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-	ASSERT(!refcount_is_zero(&db->db_holds));
 
-	return (db->db_user_ptr);
+	dbuf_verify_user(db, DBVU_NOT_EVICTING);
+	return (db->db_user);
 }
 
+void
+dmu_buf_user_evict_wait()
+{
+	taskq_wait(dbu_evict_taskq);
+}
+
 boolean_t
 dmu_buf_freeable(dmu_buf_t *dbuf)
 {
@@ -2175,6 +2932,28 @@
 	return (dbi->db_blkptr);
 }
 
+objset_t *
+dmu_buf_get_objset(dmu_buf_t *db)
+{
+	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+	return (dbi->db_objset);
+}
+
+dnode_t *
+dmu_buf_dnode_enter(dmu_buf_t *db)
+{
+	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+	DB_DNODE_ENTER(dbi);
+	return (DB_DNODE(dbi));
+}
+
+void
+dmu_buf_dnode_exit(dmu_buf_t *db)
+{
+	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+	DB_DNODE_EXIT(dbi);
+}
+
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {
@@ -2208,8 +2987,8 @@
 		if (parent == NULL) {
 			mutex_exit(&db->db_mtx);
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
-			(void) dbuf_hold_impl(dn, db->db_level+1,
-			    db->db_blkid >> epbs, FALSE, db, &parent);
+			parent = dbuf_hold_level(dn, db->db_level + 1,
+			    db->db_blkid >> epbs, db);
 			rw_exit(&dn->dn_struct_rwlock);
 			mutex_enter(&db->db_mtx);
 			db->db_parent = parent;
@@ -2260,7 +3039,7 @@
 
 	zio = dr->dr_zio;
 	mutex_enter(&dr->dt.di.dr_mtx);
-	dbuf_sync_list(&dr->dt.di.dr_children, tx);
+	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 	mutex_exit(&dr->dt.di.dr_mtx);
 	zio_nowait(zio);
@@ -2379,7 +3158,7 @@
 		 */
 		int blksz = arc_buf_size(*datap);
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
+		*datap = arc_alloc_buf(os->os_spa, blksz, db, type);
 		bcopy(db->db.db_data, (*datap)->b_data, blksz);
 	}
 	db->db_data_pending = dr;
@@ -2406,7 +3185,7 @@
 }
 
 void
-dbuf_sync_list(list_t *list, dmu_tx_t *tx)
+dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr;
 
@@ -2423,6 +3202,10 @@
 			    DMU_META_DNODE_OBJECT);
 			break;
 		}
+		if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+		}
 		list_remove(list, dr);
 		if (dr->dr_dbuf->db_level > 0)
 			dbuf_sync_indirect(dr, tx);
@@ -2444,7 +3227,8 @@
 	uint64_t fill = 0;
 	int i;
 
-	ASSERT(db->db_blkptr == bp);
+	ASSERT3P(db->db_blkptr, !=, NULL);
+	ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
@@ -2452,24 +3236,21 @@
 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
 	zio->io_prev_space_delta = delta;
 
-	if (BP_IS_HOLE(bp)) {
-		ASSERT(bp->blk_fill == 0);
-		DB_DNODE_EXIT(db);
-		return;
+	if (bp->blk_birth != 0) {
+		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
+		    BP_GET_TYPE(bp) == dn->dn_type) ||
+		    (db->db_blkid == DMU_SPILL_BLKID &&
+		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
+		    BP_IS_EMBEDDED(bp));
+		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
 	}
 
-	ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
-	    BP_GET_TYPE(bp) == dn->dn_type) ||
-	    (db->db_blkid == DMU_SPILL_BLKID &&
-	    BP_GET_TYPE(bp) == dn->dn_bonustype));
-	ASSERT(BP_GET_LEVEL(bp) == db->db_level);
-
 	mutex_enter(&db->db_mtx);
 
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
-		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+		ASSERT(!(BP_IS_HOLE(bp)) &&
 		    db->db_blkptr == &dn->dn_phys->dn_spill);
 	}
 #endif
@@ -2489,7 +3270,11 @@
 					fill++;
 			}
 		} else {
-			fill = 1;
+			if (BP_IS_HOLE(bp)) {
+				fill = 0;
+			} else {
+				fill = 1;
+			}
 		}
 	} else {
 		blkptr_t *ibp = db->db.db_data;
@@ -2497,24 +3282,101 @@
 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
 			if (BP_IS_HOLE(ibp))
 				continue;
-			fill += ibp->blk_fill;
+			fill += BP_GET_FILL(ibp);
 		}
 	}
 	DB_DNODE_EXIT(db);
 
-	bp->blk_fill = fill;
+	if (!BP_IS_EMBEDDED(bp))
+		bp->blk_fill = fill;
 
 	mutex_exit(&db->db_mtx);
+
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	*db->db_blkptr = *bp;
+	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /* ARGSUSED */
+/*
+ * This function gets called just prior to running through the compression
+ * stage of the zio pipeline. If we're an indirect block composed only of
+ * holes, then we want this indirect to be compressed away to a hole. In
+ * order to do that we must zero out any information about the holes that
+ * this indirect points to before we try to compress it.
+ */
 static void
+dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+	dmu_buf_impl_t *db = vdb;
+	dnode_t *dn;
+	blkptr_t *bp;
+	uint64_t i;
+	int epbs;
+
+	ASSERT3U(db->db_level, >, 0);
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+	/* Determine if all our children are holes */
+	for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
+		if (!BP_IS_HOLE(bp))
+			break;
+	}
+
+	/*
+	 * If all the children are holes, then zero them all out so that
+	 * this block may be compressed away to a hole.
+	 */
+	if (i == 1 << epbs) {
+		/* didn't find any non-holes */
+		bzero(db->db.db_data, db->db.db_size);
+	}
+	DB_DNODE_EXIT(db);
+}
+
+/*
+ * The SPA will call this callback several times for each zio - once
+ * for every physical child i/o (zio->io_phys_children times).  This
+ * allows the DMU to monitor the progress of each logical i/o.  For example,
+ * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
+ * block.  There may be a long delay before all copies/fragments are completed,
+ * so this callback allows us to retire dirty space gradually, as the physical
+ * i/os complete.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+	dmu_buf_impl_t *db = arg;
+	objset_t *os = db->db_objset;
+	dsl_pool_t *dp = dmu_objset_pool(os);
+	dbuf_dirty_record_t *dr;
+	int delta = 0;
+
+	dr = db->db_data_pending;
+	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+	/*
+	 * The callback will be called io_phys_children times.  Retire one
+	 * portion of our dirty space each time we are called.  Any rounding
+	 * error will be cleaned up by dsl_pool_sync()'s call to
+	 * dsl_pool_undirty_space().
+	 */
+	delta = dr->dr_accounted / zio->io_phys_children;
+	dsl_pool_undirty_space(dp, delta, zio->io_txg);
+}
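
A standalone sketch of the undirty arithmetic described in the comment above (the byte counts here are hypothetical, not from this commit): each of the io_phys_children callbacks retires one equal integer share of dr_accounted, and the division remainder is left for dsl_pool_undirty_space() at the end of the txg.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t dr_accounted = 100001;	/* hypothetical dirty bytes */
	int io_phys_children = 3;	/* e.g. three physical copies */
	uint64_t retired = 0;

	/* One equal integer share is retired per physdone callback. */
	for (int i = 0; i < io_phys_children; i++)
		retired += dr_accounted / io_phys_children;

	/* The 2-byte remainder here is cleaned up at txg sync. */
	printf("retired %ju of %ju (remainder %ju)\n",
	    (uintmax_t)retired, (uintmax_t)dr_accounted,
	    (uintmax_t)(dr_accounted - retired));
	return (0);
}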
+
+/* ARGSUSED */
+static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
-	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
-	uint64_t txg = zio->io_txg;
+	blkptr_t *bp = db->db_blkptr;
+	objset_t *os = db->db_objset;
+	dmu_tx_t *tx = os->os_synctx;
 	dbuf_dirty_record_t **drp, *dr;
 
 	ASSERT0(zio->io_error);
@@ -2527,14 +3389,7 @@
 	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
 		ASSERT(BP_EQUAL(bp, bp_orig));
 	} else {
-		objset_t *os;
-		dsl_dataset_t *ds;
-		dmu_tx_t *tx;
-
-		DB_GET_OBJSET(&os, db);
-		ds = os->os_dsl_dataset;
-		tx = os->os_synctx;
-
+		dsl_dataset_t *ds = os->os_dsl_dataset;
 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
 		dsl_dataset_block_born(ds, bp, tx);
 	}
@@ -2547,7 +3402,6 @@
 	while ((dr = *drp) != db->db_data_pending)
 		drp = &dr->dr_next;
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
-	ASSERT(dr->dr_txg == txg);
 	ASSERT(dr->dr_dbuf == db);
 	ASSERT(dr->dr_next == NULL);
 	*drp = dr->dr_next;
@@ -2570,10 +3424,7 @@
 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 		if (db->db_state != DB_NOFILL) {
 			if (dr->dt.dl.dr_data != db->db_buf)
-				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
-				    db));
-			else if (!arc_released(db->db_buf))
-				arc_set_callback(db->db_buf, dbuf_do_evict, db);
+				arc_buf_destroy(dr->dt.dl.dr_data, db);
 		}
 	} else {
 		dnode_t *dn;
@@ -2581,15 +3432,14 @@
 		DB_DNODE_ENTER(db);
 		dn = DB_DNODE(db);
 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
-		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
 		if (!BP_IS_HOLE(db->db_blkptr)) {
 			int epbs =
 			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+			ASSERT3U(db->db_blkid, <=,
+			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
 			    db->db.db_size);
-			ASSERT3U(dn->dn_phys->dn_maxblkid
-			    >> (db->db_level * epbs), >=, db->db_blkid);
-			arc_set_callback(db->db_buf, dbuf_do_evict, db);
 		}
 		DB_DNODE_EXIT(db);
 		mutex_destroy(&dr->dt.di.dr_mtx);
@@ -2601,7 +3451,7 @@
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 	db->db_data_pending = NULL;
-	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
 }
 
 static void
@@ -2652,11 +3502,13 @@
 	objset_t *os;
 	dmu_buf_impl_t *parent = db->db_parent;
 	uint64_t txg = tx->tx_txg;
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
 	int wp_flag = 0;
 
+	ASSERT(dmu_tx_is_syncing(tx));
+
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	os = dn->dn_objset;
@@ -2715,12 +3567,27 @@
 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
 	DB_DNODE_EXIT(db);
 
-	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
-		ASSERT(db->db_state != DB_NOFILL);
+	/*
+	 * We copy the blkptr now (rather than when we instantiate the dirty
+	 * record), because its value can change between open context and
+	 * syncing context. We do not need to hold dn_struct_rwlock to read
+	 * db_blkptr because we are in syncing context.
+	 */
+	dr->dr_bp_copy = *db->db_blkptr;
+
+	if (db->db_level == 0 &&
+	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+		/*
+		 * The BP for this block has been provided by open context
+		 * (by dmu_sync() or dmu_buf_write_embedded()).
+		 */
+		void *contents = (data != NULL) ? data->b_data : NULL;
+
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
-		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
-		    dbuf_write_override_ready, dbuf_write_override_done, dr,
-		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+		    &dr->dr_bp_copy, contents, db->db.db_size, &zp,
+		    dbuf_write_override_ready, NULL, NULL,
+		    dbuf_write_override_done,
+		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 		mutex_enter(&db->db_mtx);
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
@@ -2727,18 +3594,30 @@
 		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
 		mutex_exit(&db->db_mtx);
 	} else if (db->db_state == DB_NOFILL) {
-		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
+		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
-		    db->db_blkptr, NULL, db->db.db_size, &zp,
-		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
+		    &dr->dr_bp_copy, NULL, db->db.db_size, &zp,
+		    dbuf_write_nofill_ready, NULL, NULL,
+		    dbuf_write_nofill_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE,
 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
 	} else {
 		ASSERT(arc_released(data));
+
+		/*
+		 * For indirect blocks, we want to set up the children
+		 * ready callback so that we can properly handle an indirect
+		 * block that only contains holes.
+		 */
+		arc_done_func_t *children_ready_cb = NULL;
+		if (db->db_level != 0)
+			children_ready_cb = dbuf_write_children_ready;
+
 		dr->dr_zio = arc_write(zio, os->os_spa, txg,
-		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
-		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
-		    dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
-		    ZIO_FLAG_MUSTSUCCEED, &zb);
+		    &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
+		    &zp, dbuf_write_ready, children_ready_cb,
+		    dbuf_write_physdone, dbuf_write_done, db,
+		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 	}
 }
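
A notable behavioral change in this dbuf.c sync: on final release, an evictable dbuf is now parked on a size-tracked dbuf cache (dbuf_cache / dbuf_cache_size in the hunks above) rather than evicted inline, and dbuf_evict_notify() wakes a separate evictor. A toy userland model of that decision, under the simplifying assumption that cacheability and pending-evict are the only inputs (the real code also snapshots the block pointer for arc_freed()):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

static uint64_t dbuf_cache_bytes;	/* stands in for refcount_add_many() */

static void
toy_rele(bool cacheable, bool pending_evict, uint64_t db_size)
{
	if (!cacheable || pending_evict) {
		printf("destroy %ju-byte dbuf\n", (uintmax_t)db_size);
	} else {
		dbuf_cache_bytes += db_size;	/* multilist_insert() side */
		printf("cached; cache now %ju bytes\n",
		    (uintmax_t)dbuf_cache_bytes);
		/* dbuf_evict_notify() would nudge the evictor here. */
	}
}

int
main(void)
{
	toy_rele(true, false, 16384);
	toy_rele(false, false, 131072);	/* e.g. primarycache excludes it */
	return (0);
}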

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,7 +22,7 @@
 
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -66,7 +66,8 @@
 	spa_t *spa = ddt->ddt_spa;
 	objset_t *os = ddt->ddt_os;
 	uint64_t *objectp = &ddt->ddt_object[type][class];
-	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
+	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
+	    ZCHECKSUM_FLAG_DEDUP;
 	char name[DDT_NAMELEN];
 
 	ddt_object_name(ddt, type, class, name);
@@ -120,12 +121,12 @@
 	error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
 	    sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
 
-	if (error)
+	if (error != 0)
 		return (error);
 
-	error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+	VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
-	    &ddt->ddt_histogram[type][class]);
+	    &ddt->ddt_histogram[type][class]));
 
 	/*
 	 * Seed the cached statistics.
@@ -140,8 +141,7 @@
 	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 
-	ASSERT(error == 0);
-	return (error);
+	return (0);
 }
 
 static void
@@ -418,7 +418,7 @@
 
 	ddt_stat_generate(ddt, dde, &dds);
 
-	bucket = highbit(dds.dds_ref_blocks) - 1;
+	bucket = highbit64(dds.dds_ref_blocks) - 1;
 	ASSERT(bucket >= 0);
 
 	ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
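
The switch from highbit() to highbit64() above keeps the histogram's power-of-two bucketing correct for 64-bit reference counts. A minimal stand-in (assuming, as in the kernel, that highbit64() returns the 1-based index of the highest set bit):

#include <stdio.h>
#include <stdint.h>

/* Minimal stand-in for the kernel's highbit64(). */
static int
highbit64(uint64_t x)
{
	int h = 0;
	while (x != 0) {
		h++;
		x >>= 1;
	}
	return (h);
}

int
main(void)
{
	/* dds_ref_blocks of 1, 2, 5, 1000 land in buckets 0, 1, 2, 9. */
	uint64_t refs[] = { 1, 2, 5, 1000 };
	for (int i = 0; i < 4; i++)
		printf("%ju -> bucket %d\n", (uintmax_t)refs[i],
		    highbit64(refs[i]) - 1);
	return (0);
}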
@@ -596,7 +596,10 @@
 		bcopy(src, dst, s_len);
 	}
 
-	*version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
+	*version = cpfunc;
+	/* CONSTCOND */
+	if (ZFS_HOST_BYTEORDER)
+		*version |= DDT_COMPRESS_BYTEORDER_MASK;
 
 	return (c_len + 1);
 }
@@ -613,7 +616,8 @@
 	else
 		bcopy(src, dst, d_len);
 
-	if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
+	if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
+	    (ZFS_HOST_BYTEORDER != 0))
 		byteswap_uint64_array(dst, d_len);
 }
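
The rewritten byteswap test above compares two truth values rather than XOR-ing the version byte against ZFS_HOST_BYTEORDER, whose definition varies by platform. A small model of the new logic (the 0x80 mask value is an assumption for illustration, standing in for DDT_COMPRESS_BYTEORDER_MASK):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define	TOY_BYTEORDER_MASK	0x80	/* assumed flag in the version byte */

/* True when the stored entry's byteorder flag disagrees with this host's. */
static bool
needs_byteswap(uint8_t version, bool host_flag)
{
	bool stored_flag = (version & TOY_BYTEORDER_MASK) != 0;

	return (stored_flag != host_flag);
}

int
main(void)
{
	printf("%d\n", needs_byteswap(0x80 | 3, true));	/* 0: same order */
	printf("%d\n", needs_byteswap(3, true));	/* 1: swap needed */
	return (0);
}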
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,10 +21,11 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  */
-
 /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
+/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
+/* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
@@ -45,7 +46,9 @@
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/sa.h>
+#include <sys/zfeature.h>
 #ifdef _KERNEL
+#include <sys/vm.h>
 #include <sys/zfs_znode.h>
 #endif
 
@@ -129,41 +132,99 @@
 };
 
 int
-dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
-    void *tag, dmu_buf_t **dbp, int flags)
+dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
+    void *tag, dmu_buf_t **dbp)
 {
+	uint64_t blkid;
+	dmu_buf_impl_t *db;
+
+	blkid = dbuf_whichblock(dn, 0, offset);
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	db = dbuf_hold(dn, blkid, tag);
+	rw_exit(&dn->dn_struct_rwlock);
+
+	if (db == NULL) {
+		*dbp = NULL;
+		return (SET_ERROR(EIO));
+	}
+
+	*dbp = &db->db;
+	return (0);
+}
+int
+dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
+    void *tag, dmu_buf_t **dbp)
+{
 	dnode_t *dn;
 	uint64_t blkid;
 	dmu_buf_impl_t *db;
 	int err;
-	int db_flags = DB_RF_CANFAIL;
 
-	if (flags & DMU_READ_NO_PREFETCH)
-		db_flags |= DB_RF_NOPREFETCH;
-
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
-	blkid = dbuf_whichblock(dn, offset);
+	blkid = dbuf_whichblock(dn, 0, offset);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dbuf_hold(dn, blkid, tag);
 	rw_exit(&dn->dn_struct_rwlock);
+	dnode_rele(dn, FTAG);
+
 	if (db == NULL) {
-		err = SET_ERROR(EIO);
-	} else {
+		*dbp = NULL;
+		return (SET_ERROR(EIO));
+	}
+
+	*dbp = &db->db;
+	return (err);
+}
+
+int
+dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
+    void *tag, dmu_buf_t **dbp, int flags)
+{
+	int err;
+	int db_flags = DB_RF_CANFAIL;
+
+	if (flags & DMU_READ_NO_PREFETCH)
+		db_flags |= DB_RF_NOPREFETCH;
+
+	err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
+	if (err == 0) {
+		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
 		err = dbuf_read(db, NULL, db_flags);
-		if (err) {
+		if (err != 0) {
 			dbuf_rele(db, tag);
-			db = NULL;
+			*dbp = NULL;
 		}
 	}
 
-	dnode_rele(dn, FTAG);
-	*dbp = &db->db; /* NULL db plus first field offset is NULL */
 	return (err);
 }
 
 int
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+    void *tag, dmu_buf_t **dbp, int flags)
+{
+	int err;
+	int db_flags = DB_RF_CANFAIL;
+
+	if (flags & DMU_READ_NO_PREFETCH)
+		db_flags |= DB_RF_NOPREFETCH;
+
+	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
+	if (err == 0) {
+		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
+		err = dbuf_read(db, NULL, db_flags);
+		if (err != 0) {
+			dbuf_rele(db, tag);
+			*dbp = NULL;
+		}
+	}
+
+	return (err);
+}
+
+int
 dmu_bonus_max(void)
 {
 	return (DN_MAX_BONUSLEN);
@@ -271,7 +332,7 @@
 	/* as long as the bonus buf is held, the dnode will be held */
 	if (refcount_add(&db->db_holds, tag) == 1) {
 		VERIFY(dnode_add_ref(dn, db));
-		(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
+		atomic_inc_32(&dn->dn_dbufs_count);
 	}
 
 	/*
@@ -373,27 +434,29 @@
  */
 static int
 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
-    int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
+    boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 {
-	dsl_pool_t *dp = NULL;
 	dmu_buf_t **dbp;
 	uint64_t blkid, nblks, i;
 	uint32_t dbuf_flags;
 	int err;
 	zio_t *zio;
-	hrtime_t start;
 
 	ASSERT(length <= DMU_MAX_ACCESS);
 
-	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
-	if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
-		dbuf_flags |= DB_RF_NOPREFETCH;
+	/*
+	 * Note: We directly notify the prefetch code of this read, so that
+	 * we can tell it about the multi-block read.  dbuf_read() only knows
+	 * about the one block it is accessing.
+	 */
+	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
+	    DB_RF_NOPREFETCH;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift) {
 		int blkshift = dn->dn_datablkshift;
-		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
-		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
+		nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
+		    P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
 	} else {
 		if (offset + length > dn->dn_datablksz) {
 			zfs_panic_recover("zfs: accessing past end of object "
@@ -409,13 +472,10 @@
 	}
 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 
-	if (dn->dn_objset->os_dsl_dataset)
-		dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
-	start = gethrtime();
 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-	blkid = dbuf_whichblock(dn, offset);
+	blkid = dbuf_whichblock(dn, 0, offset);
 	for (i = 0; i < nblks; i++) {
-		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
+		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 		if (db == NULL) {
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
@@ -422,6 +482,7 @@
 			zio_nowait(zio);
 			return (SET_ERROR(EIO));
 		}
+
 		/* initiate async i/o */
 		if (read)
 			(void) dbuf_read(db, zio, dbuf_flags);
@@ -431,13 +492,16 @@
 #endif
 		dbp[i] = &db->db;
 	}
+
+	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
+	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
+		dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
+		    read && DNODE_IS_CACHEABLE(dn));
+	}
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/* wait for async i/o */
 	err = zio_wait(zio);
-	/* track read overhead when we are in sync context */
-	if (dp && dsl_pool_sync_context(dp))
-		dp->dp_read_overhead += gethrtime() - start;
 	if (err) {
 		dmu_buf_rele_array(dbp, nblks, tag);
 		return (err);
@@ -487,7 +551,8 @@
 
 int
 dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
-    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+    uint64_t length, boolean_t read, void *tag, int *numbufsp,
+    dmu_buf_t ***dbpp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
@@ -519,16 +584,22 @@
 	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
 }
 
+/*
+ * Issue prefetch i/os for the given blocks.  If level is greater than 0, the
+ * indirect blocks prefetched will be those that point to the blocks containing
+ * the data starting at offset, and continuing to offset + len.
+ *
+ * Note that if the indirect blocks above the blocks being prefetched are not in
+ * cache, they will be asynchronously read in.
+ */
 void
-dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+    uint64_t len, zio_priority_t pri)
 {
 	dnode_t *dn;
 	uint64_t blkid;
-	int nblks, i, err;
+	int nblks, err;
 
-	if (zfs_prefetch_disable)
-		return;
-
 	if (len == 0) {  /* they're interested in the bonus buffer */
 		dn = DMU_META_DNODE(os);
 
@@ -536,8 +607,9 @@
 			return;
 
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
-		dbuf_prefetch(dn, blkid);
+		blkid = dbuf_whichblock(dn, level,
+		    object * sizeof (dnode_phys_t));
+		dbuf_prefetch(dn, level, blkid, pri, 0);
 		rw_exit(&dn->dn_struct_rwlock);
 		return;
 	}
@@ -552,18 +624,24 @@
 		return;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	if (dn->dn_datablkshift) {
-		int blkshift = dn->dn_datablkshift;
-		nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
-		    P2ALIGN(offset, 1<<blkshift)) >> blkshift;
+	/*
+	 * offset + len - 1 is the last byte we want to prefetch for, and offset
+	 * is the first.  Then dbuf_whichblock(dn, level, off + len - 1) is the
+	 * last block we want to prefetch, and dbuf_whichblock(dn, level,
+	 * offset) is the first.  Then the number we need to prefetch is the
+	 * last - first + 1.
+	 */
+	if (level > 0 || dn->dn_datablkshift != 0) {
+		nblks = dbuf_whichblock(dn, level, offset + len - 1) -
+		    dbuf_whichblock(dn, level, offset) + 1;
 	} else {
 		nblks = (offset < dn->dn_datablksz);
 	}
 
 	if (nblks != 0) {
-		blkid = dbuf_whichblock(dn, offset);
-		for (i = 0; i < nblks; i++)
-			dbuf_prefetch(dn, blkid+i);
+		blkid = dbuf_whichblock(dn, level, offset);
+		for (int i = 0; i < nblks; i++)
+			dbuf_prefetch(dn, level, blkid + i, pri, 0);
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
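
The last − first + 1 count in the comment above is easy to sanity-check with concrete numbers. A sketch assuming 128 KB data blocks (datablkshift = 17) at level 0, where dbuf_whichblock() reduces to a right shift (the real function also folds in epbs shifts for higher levels):

#include <stdio.h>
#include <stdint.h>

/* Level-0 stand-in for dbuf_whichblock() with 128K blocks (shift 17). */
static uint64_t
whichblock(uint64_t off)
{
	return (off >> 17);
}

int
main(void)
{
	uint64_t offset = 100 * 1024;	/* starts in block 0 */
	uint64_t len = 300 * 1024;	/* last byte 409599 is in block 3 */
	uint64_t nblks = whichblock(offset + len - 1) - whichblock(offset) + 1;

	printf("prefetch %ju blocks\n", (uintmax_t)nblks);	/* 4 */
	return (0);
}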
@@ -576,98 +654,99 @@
  * the end so that the file gets shorter over time (if we crash in the
  * middle, this will leave us in a better state).  We find allocated file
  * data by simply searching the allocated level 1 indirects.
+ *
+ * On input, *start should be the first offset that does not need to be
+ * freed (e.g. "offset + length").  On return, *start will be the first
+ * offset that should be freed.
  */
 static int
-get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit)
+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
 {
-	uint64_t len = *start - limit;
-	uint64_t blkcnt = 0;
-	uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1));
+	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
+	/* bytes of data covered by a level-1 indirect block */
 	uint64_t iblkrange =
 	    dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
 
-	ASSERT(limit <= *start);
+	ASSERT3U(minimum, <=, *start);
 
-	if (len <= iblkrange * maxblks) {
-		*start = limit;
+	if (*start - minimum <= iblkrange * maxblks) {
+		*start = minimum;
 		return (0);
 	}
 	ASSERT(ISP2(iblkrange));
 
-	while (*start > limit && blkcnt < maxblks) {
+	for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
 		int err;
 
-		/* find next allocated L1 indirect */
+		/*
+		 * dnode_next_offset(BACKWARDS) will find an allocated L1
+		 * indirect block at or before the input offset.  We must
+		 * decrement *start so that it is at the end of the region
+		 * to search.
+		 */
+		(*start)--;
 		err = dnode_next_offset(dn,
 		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);
 
-		/* if there are no more, then we are done */
+		/* if there are no indirect blocks before start, we are done */
 		if (err == ESRCH) {
-			*start = limit;
-			return (0);
-		} else if (err) {
+			*start = minimum;
+			break;
+		} else if (err != 0) {
 			return (err);
 		}
-		blkcnt += 1;
 
-		/* reset offset to end of "next" block back */
+		/* set start to the beginning of this L1 indirect */
 		*start = P2ALIGN(*start, iblkrange);
-		if (*start <= limit)
-			*start = limit;
-		else
-			*start -= 1;
 	}
+	if (*start < minimum)
+		*start = minimum;
 	return (0);
 }
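
The in/out contract on *start drives a tail-first free loop in dmu_free_long_range_impl(): each pass frees one bounded chunk from the end, so an interrupted free leaves a shorter but still-consistent file. A toy walk of that loop shape (units are arbitrary; get_next_chunk() is reduced here to a simple cap standing in for iblkrange * maxblks):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t offset = 0, length = 10;
	uint64_t chunk_max = 4;		/* cap on each chunk's size */

	while (length != 0) {
		uint64_t chunk_end = offset + length;
		uint64_t chunk_begin = (chunk_end - offset > chunk_max) ?
		    chunk_end - chunk_max : offset;	/* get_next_chunk() */

		printf("free [%ju, %ju)\n", (uintmax_t)chunk_begin,
		    (uintmax_t)chunk_end);	/* [6,10) [2,6) [0,2) */
		length -= chunk_end - chunk_begin;
	}
	return (0);
}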
 
 static int
 dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
-    uint64_t length, boolean_t free_dnode)
+    uint64_t length)
 {
-	dmu_tx_t *tx;
-	uint64_t object_size, start, end, len;
-	boolean_t trunc = (length == DMU_OBJECT_END);
-	int align, err;
+	uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
+	int err;
 
-	align = 1 << dn->dn_datablkshift;
-	ASSERT(align > 0);
-	object_size = align == 1 ? dn->dn_datablksz :
-	    (dn->dn_maxblkid + 1) << dn->dn_datablkshift;
-
-	end = offset + length;
-	if (trunc || end > object_size)
-		end = object_size;
-	if (end <= offset)
+	if (offset >= object_size)
 		return (0);
-	length = end - offset;
 
-	while (length) {
-		start = end;
-		/* assert(offset <= start) */
-		err = get_next_chunk(dn, &start, offset);
+	if (length == DMU_OBJECT_END || offset + length > object_size)
+		length = object_size - offset;
+
+	while (length != 0) {
+		uint64_t chunk_end, chunk_begin;
+
+		chunk_end = chunk_begin = offset + length;
+
+		/* move chunk_begin backwards to the beginning of this chunk */
+		err = get_next_chunk(dn, &chunk_begin, offset);
 		if (err)
 			return (err);
-		len = trunc ? DMU_OBJECT_END : end - start;
+		ASSERT3U(chunk_begin, >=, offset);
+		ASSERT3U(chunk_begin, <=, chunk_end);
 
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_free(tx, dn->dn_object, start, len);
+		dmu_tx_t *tx = dmu_tx_create(os);
+		dmu_tx_hold_free(tx, dn->dn_object,
+		    chunk_begin, chunk_end - chunk_begin);
+
+		/*
+		 * Mark this transaction as typically resulting in a net
+		 * reduction in space used.
+		 */
+		dmu_tx_mark_netfree(tx);
 		err = dmu_tx_assign(tx, TXG_WAIT);
 		if (err) {
 			dmu_tx_abort(tx);
 			return (err);
 		}
+		dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx);
+		dmu_tx_commit(tx);
 
-		dnode_free_range(dn, start, trunc ? -1 : len, tx);
-
-		if (start == 0 && free_dnode) {
-			ASSERT(trunc);
-			dnode_free(dn, tx);
-		}
-
-		length -= end - start;
-
-		dmu_tx_commit(tx);
-		end = start;
+		length -= chunk_end - chunk_begin;
 	}
 	return (0);
 }
@@ -682,38 +761,43 @@
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return (err);
-	err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
+	err = dmu_free_long_range_impl(os, dn, offset, length);
+
+	/*
+	 * It is important to zero out the maxblkid when freeing the entire
+	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
+	 * will take the fast path, and (b) dnode_reallocate() can verify
+	 * that the entire file has been freed.
+	 */
+	if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
+		dn->dn_maxblkid = 0;
+
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 int
-dmu_free_object(objset_t *os, uint64_t object)
+dmu_free_long_object(objset_t *os, uint64_t object)
 {
-	dnode_t *dn;
 	dmu_tx_t *tx;
 	int err;
 
-	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
-	    FTAG, &dn);
+	err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
 	if (err != 0)
 		return (err);
-	if (dn->dn_nlevels == 1) {
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_bonus(tx, object);
-		dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END);
-		err = dmu_tx_assign(tx, TXG_WAIT);
-		if (err == 0) {
-			dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
-			dnode_free(dn, tx);
-			dmu_tx_commit(tx);
-		} else {
-			dmu_tx_abort(tx);
-		}
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_bonus(tx, object);
+	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+	dmu_tx_mark_netfree(tx);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err == 0) {
+		err = dmu_object_free(os, object, tx);
+		dmu_tx_commit(tx);
 	} else {
-		err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
+		dmu_tx_abort(tx);
 	}
-	dnode_rele(dn, FTAG);
+
 	return (err);
 }
 
@@ -854,6 +938,25 @@
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+    int compressed_size, int byteorder, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+
+	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
+	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+	VERIFY0(dmu_buf_hold_noread(os, object, offset,
+	    FTAG, &db));
+
+	dmu_buf_write_embedded(db,
+	    data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
+	    uncompressed_size, compressed_size, byteorder, tx);
+
+	dmu_buf_rele(db, FTAG);
+}
+
 /*
  * DMU support for xuio
  */
@@ -978,8 +1081,8 @@
 }
 
 #ifdef _KERNEL
-int
-dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
+static int
+dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
 {
 	dmu_buf_t **dbp;
 	int numbufs, i, err;
@@ -989,8 +1092,8 @@
 	 * NB: we could do this block-at-a-time, but it's nice
 	 * to be reading in parallel.
 	 */
-	err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
-	    &numbufs, &dbp);
+	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
+	    TRUE, FTAG, &numbufs, &dbp, 0);
 	if (err)
 		return (err);
 
@@ -1024,8 +1127,13 @@
 			else
 				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
 		} else {
+#ifdef illumos
 			err = uiomove((char *)db->db_data + bufoff, tocpy,
 			    UIO_READ, uio);
+#else
+			err = vn_io_fault_uiomove((char *)db->db_data + bufoff,
+			    tocpy, uio);
+#endif
 		}
 		if (err)
 			break;
@@ -1037,6 +1145,58 @@
 	return (err);
 }
 
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From object zdb->db_object.
+ * Starting at offset uio->uio_loffset.
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
+int
+dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+	dnode_t *dn;
+	int err;
+
+	if (size == 0)
+		return (0);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	err = dmu_read_uio_dnode(dn, uio, size);
+	DB_DNODE_EXIT(db);
+
+	return (err);
+}
+
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From the specified object.
+ * Starting at offset uio->uio_loffset.
+ */
+int
+dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
+{
+	dnode_t *dn;
+	int err;
+
+	if (size == 0)
+		return (0);
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err)
+		return (err);
+
+	err = dmu_read_uio_dnode(dn, uio, size);
+
+	dnode_rele(dn, FTAG);
+
+	return (err);
+}
+
 static int
 dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
 {
@@ -1067,6 +1227,7 @@
 		else
 			dmu_buf_will_dirty(db, tx);
 
+#ifdef illumos
 		/*
 		 * XXX uiomove could block forever (eg. nfs-backed
 		 * pages).  There needs to be a uiolockdown() function
@@ -1075,6 +1236,10 @@
 		 */
 		err = uiomove((char *)db->db_data + bufoff, tocpy,
 		    UIO_WRITE, uio);
+#else
+		err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy,
+		    uio);
+#endif
 
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx);
@@ -1089,6 +1254,15 @@
 	return (err);
 }
 
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To object zdb->db_object.
+ * Starting at offset uio->uio_loffset.
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
 int
 dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
@@ -1108,6 +1282,11 @@
 	return (err);
 }
 
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To the specified object.
+ * Starting at offset uio->uio_loffset.
+ */
 int
 dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
@@ -1129,7 +1308,7 @@
 	return (err);
 }
 
-#ifdef sun
+#ifdef illumos
 int
 dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     page_t *pp, dmu_tx_t *tx)
@@ -1184,9 +1363,67 @@
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 	return (err);
 }
-#endif	/* sun */
-#endif
 
+#else	/* !illumos */
+
+int
+dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    vm_page_t *ma, dmu_tx_t *tx)
+{
+	dmu_buf_t **dbp;
+	struct sf_buf *sf;
+	int numbufs, i;
+	int err;
+
+	if (size == 0)
+		return (0);
+
+	err = dmu_buf_hold_array(os, object, offset, size,
+	    FALSE, FTAG, &numbufs, &dbp);
+	if (err)
+		return (err);
+
+	for (i = 0; i < numbufs; i++) {
+		int tocpy, copied, thiscpy;
+		int bufoff;
+		dmu_buf_t *db = dbp[i];
+		caddr_t va;
+
+		ASSERT(size > 0);
+		ASSERT3U(db->db_size, >=, PAGESIZE);
+
+		bufoff = offset - db->db_offset;
+		tocpy = (int)MIN(db->db_size - bufoff, size);
+
+		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+		if (tocpy == db->db_size)
+			dmu_buf_will_fill(db, tx);
+		else
+			dmu_buf_will_dirty(db, tx);
+
+		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
+			ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff);
+			thiscpy = MIN(PAGESIZE, tocpy - copied);
+			va = zfs_map_page(*ma, &sf);
+			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
+			zfs_unmap_page(sf);
+			ma += 1;
+			bufoff += PAGESIZE;
+		}
+
+		if (tocpy == db->db_size)
+			dmu_buf_fill_done(db, tx);
+
+		offset += tocpy;
+		size -= tocpy;
+	}
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+	return (err);
+}
+#endif	/* illumos */
+#endif	/* _KERNEL */
+
 /*
  * Allocate a loaned anonymous arc buffer.
  */
@@ -1194,10 +1431,8 @@
 dmu_request_arcbuf(dmu_buf_t *handle, int size)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
-	spa_t *spa;
 
-	DB_GET_SPA(&spa, db);
-	return (arc_loan_buf(spa, size));
+	return (arc_loan_buf(db->db_objset->os_spa, size));
 }
 
 /*
@@ -1207,7 +1442,7 @@
 dmu_return_arcbuf(arc_buf_t *buf)
 {
 	arc_return_buf(buf, FTAG);
-	VERIFY(arc_buf_remove_ref(buf, FTAG));
+	arc_buf_destroy(buf, FTAG);
 }
 
 /*
@@ -1228,12 +1463,22 @@
 	DB_DNODE_ENTER(dbuf);
 	dn = DB_DNODE(dbuf);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	blkid = dbuf_whichblock(dn, offset);
+	blkid = dbuf_whichblock(dn, 0, offset);
 	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
 	rw_exit(&dn->dn_struct_rwlock);
 	DB_DNODE_EXIT(dbuf);
 
-	if (offset == db->db.db_offset && blksz == db->db.db_size) {
+	/*
+	 * We can only assign if the offset is aligned, the arc buf is the
+	 * same size as the dbuf, and the dbuf is not metadata.  It
+	 * can't be metadata because the loaned arc buf comes from the
+	 * user-data kmem arena.
+	 */
+	if (offset == db->db.db_offset && blksz == db->db.db_size &&
+	    DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) {
+#ifdef _KERNEL
+		curthread->td_ru.ru_oublock++;
+#endif
 		dbuf_assign_arcbuf(db, buf, tx);
 		dbuf_rele(db, FTAG);
 	} else {
@@ -1275,7 +1520,7 @@
 			 * block size still needs to be known for replay.
 			 */
 			BP_SET_LSIZE(bp, db->db_size);
-		} else {
+		} else if (!BP_IS_EMBEDDED(bp)) {
 			ASSERT(BP_GET_LEVEL(bp) == 0);
 			bp->blk_fill = 1;
 		}
@@ -1307,12 +1552,25 @@
 
 			ASSERT(BP_EQUAL(bp, bp_orig));
 			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
-			ASSERT(zio_checksum_table[chksum].ci_dedup);
+			ASSERT(zio_checksum_table[chksum].ci_flags &
+			    ZCHECKSUM_FLAG_NOPWRITE);
 		}
 		dr->dt.dl.dr_overridden_by = *zio->io_bp;
 		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
 		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
-		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
+
+		/*
+		 * Old style holes are filled with all zeros, whereas
+		 * new-style holes maintain their lsize, type, level,
+		 * and birth time (see zio_write_compress). While we
+		 * need to reset the BP_SET_LSIZE() call that happened
+		 * in dmu_sync_ready for old style holes, we do *not*
+		 * want to wipe out the information contained in new
+		 * style holes. Thus, only zero out the block pointer if
+		 * it's an old style hole.
+		 */
+		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
+		    dr->dt.dl.dr_overridden_by.blk_birth == 0)
 			BP_ZERO(&dr->dt.dl.dr_overridden_by);
 	} else {
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
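
The old-style/new-style distinction above reduces to a single predicate: only a hole whose birth txg is zero may be zeroed wholesale, since hole_birth-style holes carry meaningful lsize/type/level information. A standalone model (toy struct; blk_birth stands in for the BP field):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct toy_bp {
	bool is_hole;
	uint64_t blk_birth;	/* 0 for old-style holes */
};

/* Zero the BP only for pre-hole_birth ("old style") holes. */
static bool
should_zero(const struct toy_bp *bp)
{
	return (bp->is_hole && bp->blk_birth == 0);
}

int
main(void)
{
	struct toy_bp old_hole = { true, 0 };
	struct toy_bp new_hole = { true, 12345 };	/* keeps lsize/type */

	printf("old-style: %d, new-style: %d\n",
	    should_zero(&old_hole), should_zero(&new_hole));	/* 1, 0 */
	return (0);
}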
@@ -1357,7 +1615,7 @@
 
 static int
 dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
-    zio_prop_t *zp, zbookmark_t *zb)
+    zio_prop_t *zp, zbookmark_phys_t *zb)
 {
 	dmu_sync_arg_t *dsa;
 	dmu_tx_t *tx;
@@ -1376,10 +1634,11 @@
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = tx;
 
-	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
-	    zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
-	    dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
-	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx),
+	    zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size,
+	    zp, dmu_sync_late_arrival_ready, NULL,
+	    NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
+	    ZIO_FLAG_CANFAIL, zb));
 
 	return (0);
 }
@@ -1418,7 +1677,7 @@
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	dbuf_dirty_record_t *dr;
 	dmu_sync_arg_t *dsa;
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	dnode_t *dn;
 
@@ -1480,19 +1739,32 @@
 	ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
 
 	/*
-	 * Assume the on-disk data is X, the current syncing data is Y,
-	 * and the current in-memory data is Z (currently in dmu_sync).
-	 * X and Z are identical but Y is has been modified. Normally,
-	 * when X and Z are the same we will perform a nopwrite but if Y
-	 * is different we must disable nopwrite since the resulting write
-	 * of Y to disk can free the block containing X. If we allowed a
-	 * nopwrite to occur the block pointing to Z would reference a freed
-	 * block. Since this is a rare case we simplify this by disabling
-	 * nopwrite if the current dmu_sync-ing dbuf has been modified in
-	 * a previous transaction.
+	 * Assume the on-disk data is X, the current syncing data (in
+	 * txg - 1) is Y, and the current in-memory data is Z (currently
+	 * in dmu_sync).
+	 *
+	 * We usually want to perform a nopwrite if X and Z are the
+	 * same.  However, if Y is different (i.e. the BP is going to
+	 * change before this write takes effect), then a nopwrite will
+	 * be incorrect - we would override with X, which could have
+	 * been freed when Y was written.
+	 *
+	 * (Note that this is not a concern when we are nop-writing from
+	 * syncing context, because X and Y must be identical, because
+	 * all previous txgs have been synced.)
+	 *
+	 * Therefore, we disable nopwrite if the current BP could change
+	 * before this TXG.  There are two ways it could change: by
+	 * being dirty (dr_next is non-NULL), or by being freed
+	 * (dnode_block_freed()).  This behavior is verified by
+	 * zio_done(), which VERIFYs that the override BP is identical
+	 * to the on-disk BP.
 	 */
-	if (dr->dr_next)
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
 		zp.zp_nopwrite = B_FALSE;
+	DB_DNODE_EXIT(db);
 
 	ASSERT(dr->dr_txg == txg);
 	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
@@ -1518,8 +1790,8 @@
 
 	zio_nowait(arc_write(pio, os->os_spa, txg,
 	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
-	    DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, dmu_sync_done,
-	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+	    &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
+	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
 
 	return (0);
 }
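
The X/Y/Z comment above boils down to this rule: a nopwrite is only safe when the new data matches the on-disk block and the BP cannot change before this txg takes effect (not dirty in a later record, not freed). A minimal predicate model of that rule:

#include <stdio.h>
#include <stdbool.h>

/*
 * X = on-disk data, Y = data in the currently-syncing txg, Z = the data
 * being dmu_sync()ed now.  Nopwrite compares Z against X, so it is only
 * safe if nothing will move the BP first.
 */
static bool
nopwrite_allowed(bool z_matches_x, bool bp_may_change_first)
{
	return (z_matches_x && !bp_may_change_first);
}

int
main(void)
{
	printf("%d\n", nopwrite_allowed(true, false));	/* 1: safe */
	printf("%d\n", nopwrite_allowed(true, true));	/* 0: Y differs */
	return (0);
}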
@@ -1526,7 +1798,7 @@
 
 int
 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
-	dmu_tx_t *tx)
+    dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
@@ -1541,13 +1813,19 @@
 
 void
 dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
-	dmu_tx_t *tx)
+    dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
-	/* XXX assumes dnode_hold will not get an i/o error */
-	(void) dnode_hold(os, object, FTAG, &dn);
-	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+	/*
+	 * Send streams include each object's checksum function.  This
+	 * check ensures that the receiving system can understand the
+	 * checksum function transmitted.
+	 */
+	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+
+	VERIFY0(dnode_hold(os, object, FTAG, &dn));
+	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
 	dn->dn_checksum = checksum;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
@@ -1555,13 +1833,18 @@
 
 void
 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
-	dmu_tx_t *tx)
+    dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
-	/* XXX assumes dnode_hold will not get an i/o error */
-	(void) dnode_hold(os, object, FTAG, &dn);
-	ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
+	/*
+	 * Send streams include each object's compression function.  This
+	 * check ensures that the receiving system can understand the
+	 * compression function transmitted.
+	 */
+	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
+
+	VERIFY0(dnode_hold(os, object, FTAG, &dn));
 	dn->dn_compress = compress;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
@@ -1572,6 +1855,12 @@
 SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RW,
     &zfs_mdcomp_disable, 0, "Disable metadata compression");
 
+/*
+ * When the "redundant_metadata" property is set to "most", only indirect
+ * blocks of this level and higher will have an additional ditto block.
+ */
+int zfs_redundant_metadata_most_ditto_level = 2;
+
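
The copies++ logic later in dmu_write_policy() applies this tunable: with redundant_metadata=most, only indirect blocks at or above this level (or other metadata, or spill blocks) get the extra ditto copy. A sketch of the policy (enum and helper names are illustrative, not from the source):

#include <stdio.h>
#include <stdbool.h>

enum toy_redundant_metadata { RM_ALL, RM_MOST };
#define	TOY_MOST_DITTO_LEVEL	2	/* the tunable declared above */

/* Returns the copy count after the extra-ditto policy is applied. */
static int
apply_ditto_policy(enum toy_redundant_metadata rm, int level,
    bool is_metadata, int copies)
{
	if (rm == RM_ALL ||
	    (rm == RM_MOST &&
	    (level >= TOY_MOST_DITTO_LEVEL || is_metadata)))
		copies++;
	return (copies);
}

int
main(void)
{
	/* "most": a level-1 indirect of plain data gets no extra copy... */
	printf("%d\n", apply_ditto_policy(RM_MOST, 1, false, 2));	/* 2 */
	/* ...but a level-2 indirect does. */
	printf("%d\n", apply_ditto_policy(RM_MOST, 2, false, 2));	/* 3 */
	return (0);
}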
 void
 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 {
@@ -1594,12 +1883,16 @@
 	 *	 3. all other level 0 blocks
 	 */
 	if (ismd) {
-		/*
-		 * XXX -- we should design a compression algorithm
-		 * that specializes in arrays of bps.
-		 */
-		compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
-		    ZIO_COMPRESS_LZJB;
+		if (zfs_mdcomp_disable) {
+			compress = ZIO_COMPRESS_EMPTY;
+		} else {
+			/*
+			 * XXX -- we should design a compression algorithm
+			 * that specializes in arrays of bps.
+			 */
+			compress = zio_compress_select(os->os_spa,
+			    ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
+		}
 
 		/*
 		 * Metadata always gets checksummed.  If the data
@@ -1608,9 +1901,18 @@
 		 * as well.  Otherwise, the metadata checksum defaults
 		 * to fletcher4.
 		 */
-		if (zio_checksum_table[checksum].ci_correctable < 1 ||
-		    zio_checksum_table[checksum].ci_eck)
+		if (!(zio_checksum_table[checksum].ci_flags &
+		    ZCHECKSUM_FLAG_METADATA) ||
+		    (zio_checksum_table[checksum].ci_flags &
+		    ZCHECKSUM_FLAG_EMBEDDED))
 			checksum = ZIO_CHECKSUM_FLETCHER_4;
+
+		if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
+		    (os->os_redundant_metadata ==
+		    ZFS_REDUNDANT_METADATA_MOST &&
+		    (level >= zfs_redundant_metadata_most_ditto_level ||
+		    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
+			copies++;
 	} else if (wp & WP_NOFILL) {
 		ASSERT(level == 0);
 
@@ -1622,9 +1924,10 @@
 		 * pipeline.
 		 */
 		compress = ZIO_COMPRESS_OFF;
-		checksum = ZIO_CHECKSUM_OFF;
+		checksum = ZIO_CHECKSUM_NOPARITY;
 	} else {
-		compress = zio_compress_select(dn->dn_compress, compress);
+		compress = zio_compress_select(os->os_spa, dn->dn_compress,
+		    compress);
 
 		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
 		    zio_checksum_select(dn->dn_checksum, checksum) :
@@ -1640,17 +1943,20 @@
 		 */
 		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
 			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
-			if (!zio_checksum_table[checksum].ci_dedup)
+			if (!(zio_checksum_table[checksum].ci_flags &
+			    ZCHECKSUM_FLAG_DEDUP))
 				dedup_verify = B_TRUE;
 		}
 
 		/*
-		 * Enable nopwrite if we have a cryptographically secure
-		 * checksum that has no known collisions (i.e. SHA-256)
-		 * and compression is enabled.  We don't enable nopwrite if
-		 * dedup is enabled as the two features are mutually exclusive.
+		 * Enable nopwrite if we have secure enough checksum
+		 * algorithm (see comment in zio_nop_write) and
+		 * compression is enabled.  We don't enable nopwrite if
+		 * dedup is enabled as the two features are mutually
+		 * exclusive.
 		 */
-		nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup &&
+		nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
+		    ZCHECKSUM_FLAG_NOPWRITE) &&
 		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
 	}
 
@@ -1658,7 +1964,7 @@
 	zp->zp_compress = compress;
 	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
 	zp->zp_level = level;
-	zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
+	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
 	zp->zp_dedup = dedup;
 	zp->zp_dedup_verify = dedup && dedup_verify;
 	zp->zp_nopwrite = nopwrite;
@@ -1668,25 +1974,20 @@
 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
 {
 	dnode_t *dn;
-	int i, err;
+	int err;
 
-	err = dnode_hold(os, object, FTAG, &dn);
-	if (err)
-		return (err);
 	/*
 	 * Sync any current changes before
 	 * we go trundling through the block pointers.
 	 */
-	for (i = 0; i < TXG_SIZE; i++) {
-		if (list_link_active(&dn->dn_dirty_link[i]))
-			break;
+	err = dmu_object_wait_synced(os, object);
+	if (err) {
+		return (err);
 	}
-	if (i != TXG_SIZE) {
-		dnode_rele(dn, FTAG);
-		txg_wait_synced(dmu_objset_pool(os), 0);
-		err = dnode_hold(os, object, FTAG, &dn);
-		if (err)
-			return (err);
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err) {
+		return (err);
 	}
 
 	err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
@@ -1695,6 +1996,37 @@
 	return (err);
 }
 
+/*
+ * Given the ZFS object, if it contains any dirty nodes,
+ * this function flushes all dirty blocks to disk. This
+ * ensures the DMU object info is updated. A more efficient
+ * future version might just find the TXG with the maximum
+ * ID and wait for that to be synced.
+ */
+int
+dmu_object_wait_synced(objset_t *os, uint64_t object)
+{
+	dnode_t *dn;
+	int error, i;
+
+	error = dnode_hold(os, object, FTAG, &dn);
+	if (error) {
+		return (error);
+	}
+
+	for (i = 0; i < TXG_SIZE; i++) {
+		if (list_link_active(&dn->dn_dirty_link[i])) {
+			break;
+		}
+	}
+	dnode_rele(dn, FTAG);
+	if (i != TXG_SIZE) {
+		txg_wait_synced(dmu_objset_pool(os), 0);
+	}
+
+	return (0);
+}
+
 void
 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
@@ -1714,11 +2046,12 @@
 	doi->doi_indirection = dn->dn_nlevels;
 	doi->doi_checksum = dn->dn_checksum;
 	doi->doi_compress = dn->dn_compress;
+	doi->doi_nblkptr = dn->dn_nblkptr;
 	doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
 	doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 	doi->doi_fill_count = 0;
 	for (int i = 0; i < dnp->dn_nblkptr; i++)
-		doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
+		doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
 
 	mutex_exit(&dn->dn_mtx);
 	rw_exit(&dn->dn_struct_rwlock);
@@ -1831,10 +2164,11 @@
 	xuio_stat_init();
 	dmu_objset_init();
 	dnode_init();
-	dbuf_init();
 	zfetch_init();
+	zio_compress_init();
 	l2arc_init();
 	arc_init();
+	dbuf_init();
 }
 
 void
@@ -1843,6 +2177,7 @@
 	arc_fini(); /* arc depends on l2arc, so arc must go first */
 	l2arc_fini();
 	zfetch_fini();
+	zio_compress_fini();
 	dbuf_fini();
 	dnode_fini();
 	dmu_objset_fini();

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -131,7 +131,7 @@
 /* ARGSUSED */
 static int
 diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct diffarg *da = arg;
 	int err = 0;
@@ -139,10 +139,10 @@
 	if (issig(JUSTLOOKING) && issig(FORREAL))
 		return (SET_ERROR(EINTR));
 
-	if (zb->zb_object != DMU_META_DNODE_OBJECT)
+	if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT)
 		return (0);
 
-	if (bp == NULL) {
+	if (BP_IS_HOLE(bp)) {
 		uint64_t span = DBP_SPAN(dnp, zb->zb_level);
 		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
 
@@ -153,7 +153,7 @@
 	} else if (zb->zb_level == 0) {
 		dnode_phys_t *blk;
 		arc_buf_t *abuf;
-		uint32_t aflags = ARC_WAIT;
+		arc_flags_t aflags = ARC_FLAG_WAIT;
 		int blksz = BP_GET_LSIZE(bp);
 		int i;
 
@@ -170,7 +170,7 @@
 			if (err)
 				break;
 		}
-		(void) arc_buf_remove_ref(abuf, &abuf);
+		arc_buf_destroy(abuf, &abuf);
 		if (err)
 			return (err);
 		/* Don't care about the data blocks */
@@ -215,7 +215,7 @@
 		return (error);
 	}
 
-	if (!dsl_dataset_is_before(tosnap, fromsnap)) {
+	if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) {
 		dsl_dataset_rele(fromsnap, FTAG);
 		dsl_dataset_rele(tosnap, FTAG);
 		dsl_pool_rele(dp, FTAG);
@@ -222,7 +222,7 @@
 		return (SET_ERROR(EXDEV));
 	}
 
-	fromtxg = fromsnap->ds_phys->ds_creation_txg;
+	fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg;
 	dsl_dataset_rele(fromsnap, FTAG);
 
 	dsl_dataset_long_hold(tosnap, FTAG);

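A note on the diff_cb() changes above: with the updated traversal code the
callback can be invoked with bp == NULL for dnode-level visits, while sparse
regions are now indicated by BP_IS_HOLE(bp) rather than by a NULL bp. A
minimal sketch of a visitor honoring the new contract (example_cb is an
illustrative name):

    static int
    example_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
        const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
    {
    	if (bp == NULL)
    		return (0);	/* dnode-level callback: no block pointer */
    	if (BP_IS_HOLE(bp))
    		return (0);	/* unallocated (sparse) region */
    	/* ... handle an allocated block pointer here ... */
    	return (0);
    }
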
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -28,6 +29,8 @@
 #include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
 #include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
 
 uint64_t
 dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
@@ -48,6 +51,12 @@
 		 * reasonably sparse (at most 1/4 full).  Look from the
 		 * beginning once, but after that keep looking from here.
 		 * If we can't find one, just keep going from here.
+		 *
+		 * Note that dmu_traverse depends on the behavior that we use
+		 * multiple blocks of the dnode object before going back to
+		 * reuse objects.  Any change to this algorithm should preserve
+		 * that property or find another solution to the issues
+		 * described in traverse_visitbp.
 		 */
 		if (P2PHASE(object, L2_dnode_count) == 0) {
 			uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
@@ -106,11 +115,9 @@
 
 int
 dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
-    int blocksize, dmu_object_type_t bonustype, int bonuslen)
+    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	dnode_t *dn;
-	dmu_tx_t *tx;
-	int nblkptr;
 	int err;
 
 	if (object == DMU_META_DNODE_OBJECT)
@@ -121,44 +128,9 @@
 	if (err)
 		return (err);
 
-	if (dn->dn_type == ot && dn->dn_datablksz == blocksize &&
-	    dn->dn_bonustype == bonustype && dn->dn_bonuslen == bonuslen) {
-		/* nothing is changing, this is a noop */
-		dnode_rele(dn, FTAG);
-		return (0);
-	}
-
-	if (bonustype == DMU_OT_SA) {
-		nblkptr = 1;
-	} else {
-		nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
-	}
-
-	/*
-	 * If we are losing blkptrs or changing the block size this must
-	 * be a new file instance.   We must clear out the previous file
-	 * contents before we can change this type of metadata in the dnode.
-	 */
-	if (dn->dn_nblkptr > nblkptr || dn->dn_datablksz != blocksize) {
-		err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
-		if (err)
-			goto out;
-	}
-
-	tx = dmu_tx_create(os);
-	dmu_tx_hold_bonus(tx, object);
-	err = dmu_tx_assign(tx, TXG_WAIT);
-	if (err) {
-		dmu_tx_abort(tx);
-		goto out;
-	}
-
 	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
 
-	dmu_tx_commit(tx);
-out:
 	dnode_rele(dn, FTAG);
-
 	return (err);
 }
 
@@ -183,6 +155,11 @@
 	return (0);
 }
 
+/*
+ * Return (in *objectp) the next object which is allocated (or a hole)
+ * after *object, taking into account only objects that may have been modified
+ * after the specified txg.
+ */
 int
 dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
 {
@@ -196,3 +173,54 @@
 
 	return (error);
 }
+
+/*
+ * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
+ * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
+ *
+ * Only for use from syncing context, on MOS objects.
+ */
+void
+dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
+	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
+		dnode_rele(dn, FTAG);
+		return;
+	}
+	ASSERT3U(dn->dn_type, ==, old_type);
+	ASSERT0(dn->dn_maxblkid);
+	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
+	    DMU_OTN_ZAP_METADATA;
+	dnode_setdirty(dn, tx);
+	dnode_rele(dn, FTAG);
+
+	mzap_create_impl(mos, object, 0, 0, tx);
+
+	spa_feature_incr(dmu_objset_spa(mos),
+	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
+}
+
+void
+dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	dmu_object_type_t t;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
+	t = dn->dn_type;
+	dnode_rele(dn, FTAG);
+
+	if (t == DMU_OTN_ZAP_METADATA) {
+		spa_feature_decr(dmu_objset_spa(mos),
+		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
+	}
+	VERIFY0(dmu_object_free(mos, object, tx));
+}

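The two new helpers above are intentionally symmetric around the
SPA_FEATURE_EXTENSIBLE_DATASET refcount: dmu_object_zapify() bumps it when
converting an object to DMU_OTN_ZAP_METADATA, and dmu_object_free_zapified()
drops it again before freeing. A hedged sketch of the intended pairing from
syncing context (the surrounding sync task, 'mos', and 'object' are assumed
to exist; DMU_OT_NONE stands in for whatever the object's prior type was):

    /* Inside a sync task: tx is a syncing transaction on the MOS. */
    dmu_object_zapify(mos, object, DMU_OT_NONE, tx);  /* feature refcount +1 */
    /* ... populate the object with zap_add()/zap_update() entries ... */
    dmu_object_free_zapified(mos, object, tx);        /* refcount -1, freed */
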
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,8 +21,13 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -47,6 +52,7 @@
 #include <sys/sa.h>
 #include <sys/zfs_onexit.h>
 #include <sys/dsl_destroy.h>
+#include <sys/vdev.h>
 
 /*
  * Needed to close a window in dnode_move() that allows the objset to be freed
@@ -54,6 +60,16 @@
  */
 krwlock_t os_lock;
 
+/*
+ * Tunable to overwrite the maximum number of threads for the parallization
+ * of dmu_objset_find_dp, needed to speed up the import of pools with many
+ * datasets.
+ * Default is 4 times the number of leaf vdevs.
+ */
+int dmu_find_threads = 0;
+
+static void dmu_objset_find_dp_cb(void *arg);
+
 void
 dmu_objset_init(void)
 {
@@ -115,13 +131,13 @@
 	return (ds ? ds->ds_object : 0);
 }
 
-uint64_t
+zfs_sync_type_t
 dmu_objset_syncprop(objset_t *os)
 {
 	return (os->os_sync);
 }
 
-uint64_t
+zfs_logbias_op_t
 dmu_objset_logbias(objset_t *os)
 {
 	return (os->os_logbias);
@@ -150,7 +166,8 @@
 	 */
 	ASSERT(newval != ZIO_COMPRESS_INHERIT);
 
-	os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
+	os->os_compress = zio_compress_select(os->os_spa, newval,
+	    ZIO_COMPRESS_ON);
 }
 
 static void
@@ -230,6 +247,20 @@
 }
 
 static void
+redundant_metadata_changed_cb(void *arg, uint64_t newval)
+{
+	objset_t *os = arg;
+
+	/*
+	 * Inheritance and range checking should have been done by now.
+	 */
+	ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
+	    newval == ZFS_REDUNDANT_METADATA_MOST);
+
+	os->os_redundant_metadata = newval;
+}
+
+static void
 logbias_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
@@ -241,6 +272,14 @@
 		zil_set_logbias(os->os_zil, newval);
 }
 
+static void
+recordsize_changed_cb(void *arg, uint64_t newval)
+{
+	objset_t *os = arg;
+
+	os->os_recordsize = newval;
+}
+
 void
 dmu_objset_byteswap(void *buf, size_t size)
 {
@@ -271,15 +310,13 @@
 	os->os_spa = spa;
 	os->os_rootbp = bp;
 	if (!BP_IS_HOLE(os->os_rootbp)) {
-		uint32_t aflags = ARC_WAIT;
-		zbookmark_t zb;
+		arc_flags_t aflags = ARC_FLAG_WAIT;
+		zbookmark_phys_t zb;
 		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
 		if (DMU_OS_IS_L2CACHEABLE(os))
-			aflags |= ARC_L2CACHE;
-		if (DMU_OS_IS_L2COMPRESSIBLE(os))
-			aflags |= ARC_L2COMPRESS;
+			aflags |= ARC_FLAG_L2CACHE;
 
 		dprintf_bp(os->os_rootbp, "reading %s", "");
 		err = arc_read(NULL, spa, os->os_rootbp,
@@ -296,14 +333,13 @@
 		/* Increase the blocksize if we are permitted. */
 		if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
 		    arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
-			arc_buf_t *buf = arc_buf_alloc(spa,
+			arc_buf_t *buf = arc_alloc_buf(spa,
 			    sizeof (objset_phys_t), &os->os_phys_buf,
 			    ARC_BUFC_METADATA);
 			bzero(buf->b_data, sizeof (objset_phys_t));
 			bcopy(os->os_phys_buf->b_data, buf->b_data,
 			    arc_buf_size(os->os_phys_buf));
-			(void) arc_buf_remove_ref(os->os_phys_buf,
-			    &os->os_phys_buf);
+			arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
 			os->os_phys_buf = buf;
 		}
 
@@ -312,7 +348,7 @@
 	} else {
 		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
 		    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
-		os->os_phys_buf = arc_buf_alloc(spa, size,
+		os->os_phys_buf = arc_alloc_buf(spa, size,
 		    &os->os_phys_buf, ARC_BUFC_METADATA);
 		os->os_phys = os->os_phys_buf->b_data;
 		bzero(os->os_phys, size);
@@ -324,7 +360,18 @@
 	 * default (fletcher2/off).  Snapshots don't need to know about
 	 * checksum/compression/copies.
 	 */
-	if (ds) {
+	if (ds != NULL) {
+		boolean_t needlock = B_FALSE;
+
+		/*
+		 * Note: it's valid to open the objset if the dataset is
+		 * long-held, in which case the pool_config lock will not
+		 * be held.
+		 */
+		if (!dsl_pool_config_held(dmu_objset_pool(os))) {
+			needlock = B_TRUE;
+			dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+		}
 		err = dsl_prop_register(ds,
 		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
 		    primary_cache_changed_cb, os);
@@ -333,7 +380,7 @@
 			    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
 			    secondary_cache_changed_cb, os);
 		}
-		if (!dsl_dataset_is_snapshot(ds)) {
+		if (!ds->ds_is_snapshot) {
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
@@ -364,27 +411,39 @@
 				    zfs_prop_to_name(ZFS_PROP_SYNC),
 				    sync_changed_cb, os);
 			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(
+				    ZFS_PROP_REDUNDANT_METADATA),
+				    redundant_metadata_changed_cb, os);
+			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+				    recordsize_changed_cb, os);
+			}
 		}
+		if (needlock)
+			dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 		if (err != 0) {
-			VERIFY(arc_buf_remove_ref(os->os_phys_buf,
-			    &os->os_phys_buf));
+			arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
 			kmem_free(os, sizeof (objset_t));
 			return (err);
 		}
-	} else if (ds == NULL) {
+	} else {
 		/* It's the meta-objset. */
 		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
-		os->os_compress = ZIO_COMPRESS_LZJB;
+		os->os_compress = ZIO_COMPRESS_ON;
 		os->os_copies = spa_max_replication(spa);
 		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
-		os->os_dedup_verify = 0;
-		os->os_logbias = 0;
-		os->os_sync = 0;
+		os->os_dedup_verify = B_FALSE;
+		os->os_logbias = ZFS_LOGBIAS_LATENCY;
+		os->os_sync = ZFS_SYNC_STANDARD;
 		os->os_primary_cache = ZFS_CACHE_ALL;
 		os->os_secondary_cache = ZFS_CACHE_ALL;
 	}
 
-	if (ds == NULL || !dsl_dataset_is_snapshot(ds))
+	if (ds == NULL || !ds->ds_is_snapshot)
 		os->os_zil_header = os->os_phys->os_zil_header;
 	os->os_zil = zil_alloc(os, &os->os_zil_header);
 
@@ -403,29 +462,15 @@
 	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 
-	DMU_META_DNODE(os) = dnode_special_open(os,
-	    &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
-	    &os->os_meta_dnode);
+	dnode_special_open(os, &os->os_phys->os_meta_dnode,
+	    DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
 	if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
-		DMU_USERUSED_DNODE(os) = dnode_special_open(os,
-		    &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
-		    &os->os_userused_dnode);
-		DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
-		    &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
-		    &os->os_groupused_dnode);
+		dnode_special_open(os, &os->os_phys->os_userused_dnode,
+		    DMU_USERUSED_OBJECT, &os->os_userused_dnode);
+		dnode_special_open(os, &os->os_phys->os_groupused_dnode,
+		    DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
 	}
 
-	/*
-	 * We should be the only thread trying to do this because we
-	 * have ds_opening_lock
-	 */
-	if (ds) {
-		mutex_enter(&ds->ds_lock);
-		ASSERT(ds->ds_objset == NULL);
-		ds->ds_objset = os;
-		mutex_exit(&ds->ds_lock);
-	}
-
 	*osp = os;
 	return (0);
 }
@@ -435,12 +480,29 @@
 {
 	int err = 0;
 
+	/*
+	 * We shouldn't be doing anything with dsl_dataset_t's unless the
+	 * pool_config lock is held, or the dataset is long-held.
+	 */
+	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
+	    dsl_dataset_long_held(ds));
+
 	mutex_enter(&ds->ds_opening_lock);
-	*osp = ds->ds_objset;
-	if (*osp == NULL) {
+	if (ds->ds_objset == NULL) {
+		objset_t *os;
+		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 		err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
-		    ds, dsl_dataset_get_blkptr(ds), osp);
+		    ds, dsl_dataset_get_blkptr(ds), &os);
+		rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+		if (err == 0) {
+			mutex_enter(&ds->ds_lock);
+			ASSERT(ds->ds_objset == NULL);
+			ds->ds_objset = os;
+			mutex_exit(&ds->ds_lock);
+		}
 	}
+	*osp = ds->ds_objset;
 	mutex_exit(&ds->ds_opening_lock);
 	return (err);
 }
@@ -474,6 +536,25 @@
 	return (err);
 }
 
+static int
+dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp)
+{
+	int err;
+
+	err = dmu_objset_from_ds(ds, osp);
+	if (err != 0) {
+		dsl_dataset_disown(ds, tag);
+	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
+		dsl_dataset_disown(ds, tag);
+		return (SET_ERROR(EINVAL));
+	} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+		dsl_dataset_disown(ds, tag);
+		return (SET_ERROR(EROFS));
+	}
+	return (err);
+}
+
 /*
  * dsl_pool must not be held when this is called.
  * Upon successful return, there will be a longhold on the dataset,
@@ -495,21 +576,26 @@
 		dsl_pool_rele(dp, FTAG);
 		return (err);
 	}
+	err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
+	dsl_pool_rele(dp, FTAG);
 
-	err = dmu_objset_from_ds(ds, osp);
-	dsl_pool_rele(dp, FTAG);
-	if (err != 0) {
-		dsl_dataset_disown(ds, tag);
-	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
-		dsl_dataset_disown(ds, tag);
-		return (SET_ERROR(EINVAL));
-	} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
-		dsl_dataset_disown(ds, tag);
-		return (SET_ERROR(EROFS));
-	}
 	return (err);
 }
 
+int
+dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp)
+{
+	dsl_dataset_t *ds;
+	int err;
+
+	err = dsl_dataset_own_obj(dp, obj, tag, &ds);
+	if (err != 0)
+		return (err);
+
+	return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
+}
+
 void
 dmu_objset_rele(objset_t *os, void *tag)
 {
@@ -518,7 +604,37 @@
 	dsl_pool_rele(dp, tag);
 }
 
+/*
+ * When we are called, os MUST refer to an objset associated with a dataset
+ * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
+ * == tag.  We will then release and reacquire ownership of the dataset while
+ * holding the pool config_rwlock so that no intervening namespace or
+ * ownership changes can occur.
+ *
+ * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
+ * release the hold on its dataset and acquire a new one on the dataset of the
+ * same name so that it can be partially torn down and reconstructed.
+ */
 void
+dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
+    void *tag)
+{
+	dsl_pool_t *dp;
+	char name[ZFS_MAX_DATASET_NAME_LEN];
+
+	VERIFY3P(ds, !=, NULL);
+	VERIFY3P(ds->ds_owner, ==, tag);
+	VERIFY(dsl_dataset_long_held(ds));
+
+	dsl_dataset_name(ds, name);
+	dp = ds->ds_dir->dd_pool;
+	dsl_pool_config_enter(dp, FTAG);
+	dsl_dataset_disown(ds, tag);
+	VERIFY0(dsl_dataset_own(dp, name, tag, newds));
+	dsl_pool_config_exit(dp, FTAG);
+}
+
+void
 dmu_objset_disown(objset_t *os, void *tag)
 {
 	dsl_dataset_disown(os->os_dsl_dataset, tag);
@@ -527,41 +643,53 @@
 void
 dmu_objset_evict_dbufs(objset_t *os)
 {
+	dnode_t dn_marker;
 	dnode_t *dn;
 
 	mutex_enter(&os->os_lock);
+	dn = list_head(&os->os_dnodes);
+	while (dn != NULL) {
+		/*
+		 * Skip dnodes without holds.  We have to do this dance
+		 * because dnode_add_ref() only works if there is already a
+		 * hold.  If the dnode has no holds, then it has no dbufs.
+		 */
+		if (dnode_add_ref(dn, FTAG)) {
+			list_insert_after(&os->os_dnodes, dn, &dn_marker);
+			mutex_exit(&os->os_lock);
 
-	/* process the mdn last, since the other dnodes have holds on it */
-	list_remove(&os->os_dnodes, DMU_META_DNODE(os));
-	list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
+			dnode_evict_dbufs(dn);
+			dnode_rele(dn, FTAG);
 
-	/*
-	 * Find the first dnode with holds.  We have to do this dance
-	 * because dnode_add_ref() only works if you already have a
-	 * hold.  If there are no holds then it has no dbufs so OK to
-	 * skip.
-	 */
-	for (dn = list_head(&os->os_dnodes);
-	    dn && !dnode_add_ref(dn, FTAG);
-	    dn = list_next(&os->os_dnodes, dn))
-		continue;
+			mutex_enter(&os->os_lock);
+			dn = list_next(&os->os_dnodes, &dn_marker);
+			list_remove(&os->os_dnodes, &dn_marker);
+		} else {
+			dn = list_next(&os->os_dnodes, dn);
+		}
+	}
+	mutex_exit(&os->os_lock);
 
-	while (dn) {
-		dnode_t *next_dn = dn;
-
-		do {
-			next_dn = list_next(&os->os_dnodes, next_dn);
-		} while (next_dn && !dnode_add_ref(next_dn, FTAG));
-
-		mutex_exit(&os->os_lock);
-		dnode_evict_dbufs(dn);
-		dnode_rele(dn, FTAG);
-		mutex_enter(&os->os_lock);
-		dn = next_dn;
+	if (DMU_USERUSED_DNODE(os) != NULL) {
+		dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
+		dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
 	}
-	mutex_exit(&os->os_lock);
+	dnode_evict_dbufs(DMU_META_DNODE(os));
 }
 
+/*
+ * Objset eviction processing is split into two pieces.
+ * The first marks the objset as evicting, evicts any dbufs that
+ * have a refcount of zero, and then queues up the objset for the
+ * second phase of eviction.  Once os->os_dnodes has been cleared by
+ * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
+ * The second phase closes the special dnodes, dequeues the objset from
+ * the list of those undergoing eviction, and finally frees the objset.
+ *
+ * NOTE: Due to asynchronous eviction processing (invocation of
+ *       dnode_buf_pageout()), it is possible for the meta dnode for the
+ *       objset to have no holds even though os->os_dnodes is not empty.
+ */
 void
 dmu_objset_evict(objset_t *os)
 {
@@ -570,34 +698,8 @@
 	for (int t = 0; t < TXG_SIZE; t++)
 		ASSERT(!dmu_objset_is_dirty(os, t));
 
-	if (ds) {
-		if (!dsl_dataset_is_snapshot(ds)) {
-			VERIFY0(dsl_prop_unregister(ds,
-			    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
-			    checksum_changed_cb, os));
-			VERIFY0(dsl_prop_unregister(ds,
-			    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
-			    compression_changed_cb, os));
-			VERIFY0(dsl_prop_unregister(ds,
-			    zfs_prop_to_name(ZFS_PROP_COPIES),
-			    copies_changed_cb, os));
-			VERIFY0(dsl_prop_unregister(ds,
-			    zfs_prop_to_name(ZFS_PROP_DEDUP),
-			    dedup_changed_cb, os));
-			VERIFY0(dsl_prop_unregister(ds,
-			    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
-			    logbias_changed_cb, os));
-			VERIFY0(dsl_prop_unregister(ds,
-			    zfs_prop_to_name(ZFS_PROP_SYNC),
-			    sync_changed_cb, os));
-		}
-		VERIFY0(dsl_prop_unregister(ds,
-		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
-		    primary_cache_changed_cb, os));
-		VERIFY0(dsl_prop_unregister(ds,
-		    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
-		    secondary_cache_changed_cb, os));
-	}
+	if (ds)
+		dsl_prop_unregister_all(ds, os);
 
 	if (os->os_sa)
 		sa_tear_down(os);
@@ -604,6 +706,21 @@
 
 	dmu_objset_evict_dbufs(os);
 
+	mutex_enter(&os->os_lock);
+	spa_evicting_os_register(os->os_spa, os);
+	if (list_is_empty(&os->os_dnodes)) {
+		mutex_exit(&os->os_lock);
+		dmu_objset_evict_done(os);
+	} else {
+		mutex_exit(&os->os_lock);
+	}
+}
+
+void
+dmu_objset_evict_done(objset_t *os)
+{
+	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+
 	dnode_special_close(&os->os_meta_dnode);
 	if (DMU_USERUSED_DNODE(os)) {
 		dnode_special_close(&os->os_userused_dnode);
@@ -611,10 +728,8 @@
 	}
 	zil_free(os->os_zil);
 
-	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+	arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
 
-	VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
-
 	/*
 	 * This is a barrier to prevent the objset from going away in
 	 * dnode_move() until we can safely ensure that the objset is still in
@@ -627,6 +742,7 @@
 	mutex_destroy(&os->os_lock);
 	mutex_destroy(&os->os_obj_lock);
 	mutex_destroy(&os->os_user_ptr_lock);
+	spa_evicting_os_deregister(os->os_spa, os);
 	kmem_free(os, sizeof (objset_t));
 }
 
@@ -672,11 +788,17 @@
 
 		/*
 		 * Determine the number of levels necessary for the meta-dnode
-		 * to contain DN_MAX_OBJECT dnodes.
+		 * to contain DN_MAX_OBJECT dnodes.  Note that in order to
+		 * ensure that we do not overflow 64 bits, there has to be
+		 * an nlevels that gives us a number of blocks > DN_MAX_OBJECT
+		 * but < 2^64.  Therefore,
+		 * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be
+		 * less than (64 - log2(DN_MAX_OBJECT)) (16).
 		 */
-		while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
+		while ((uint64_t)mdn->dn_nblkptr <<
+		    (mdn->dn_datablkshift - DNODE_SHIFT +
 		    (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
-		    DN_MAX_OBJECT * sizeof (dnode_phys_t))
+		    DN_MAX_OBJECT)
 			levels++;
 
 		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
@@ -720,6 +842,9 @@
 	if (strchr(doca->doca_name, '@') != NULL)
 		return (SET_ERROR(EINVAL));
 
+	if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
+		return (SET_ERROR(ENAMETOOLONG));
+
 	error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
 	if (error != 0)
 		return (error);
@@ -727,9 +852,11 @@
 		dsl_dir_rele(pdd, FTAG);
 		return (SET_ERROR(EEXIST));
 	}
+	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+	    doca->doca_cred);
 	dsl_dir_rele(pdd, FTAG);
 
-	return (0);
+	return (error);
 }
 
 static void
@@ -750,9 +877,11 @@
 	    doca->doca_cred, tx);
 
 	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	bp = dsl_dataset_get_blkptr(ds);
 	os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
 	    ds, bp, doca->doca_type, tx);
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 	if (doca->doca_userfunc != NULL) {
 		doca->doca_userfunc(os, doca->doca_userarg,
@@ -778,7 +907,8 @@
 	doca.doca_type = type;
 
 	return (dsl_sync_task(name,
-	    dmu_objset_create_check, dmu_objset_create_sync, &doca, 5));
+	    dmu_objset_create_check, dmu_objset_create_sync, &doca,
+	    5, ZFS_SPACE_CHECK_NORMAL));
 }
 
 typedef struct dmu_objset_clone_arg {
@@ -801,6 +931,9 @@
 	if (strchr(doca->doca_clone, '@') != NULL)
 		return (SET_ERROR(EINVAL));
 
+	if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN)
+		return (SET_ERROR(ENAMETOOLONG));
+
 	error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
 	if (error != 0)
 		return (error);
@@ -808,10 +941,12 @@
 		dsl_dir_rele(pdd, FTAG);
 		return (SET_ERROR(EEXIST));
 	}
-	/* You can't clone across pools. */
-	if (pdd->dd_pool != dp) {
+
+	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+	    doca->doca_cred);
+	if (error != 0) {
 		dsl_dir_rele(pdd, FTAG);
-		return (SET_ERROR(EXDEV));
+		return (SET_ERROR(EDQUOT));
 	}
 	dsl_dir_rele(pdd, FTAG);
 
@@ -819,14 +954,8 @@
 	if (error != 0)
 		return (error);
 
-	/* You can't clone across pools. */
-	if (origin->ds_dir->dd_pool != dp) {
-		dsl_dataset_rele(origin, FTAG);
-		return (SET_ERROR(EXDEV));
-	}
-
 	/* You can only clone snapshots, not the head datasets. */
-	if (!dsl_dataset_is_snapshot(origin)) {
+	if (!origin->ds_is_snapshot) {
 		dsl_dataset_rele(origin, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
@@ -844,7 +973,7 @@
 	const char *tail;
 	dsl_dataset_t *origin, *ds;
 	uint64_t obj;
-	char namebuf[MAXNAMELEN];
+	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
 
 	VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
 	VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
@@ -871,7 +1000,8 @@
 	doca.doca_cred = CRED();
 
 	return (dsl_sync_task(clone,
-	    dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5));
+	    dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
+	    5, ZFS_SPACE_CHECK_NORMAL));
 }
 
 int
@@ -923,7 +1053,7 @@
 	objset_t *os = arg;
 	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
 
-	ASSERT3P(bp, ==, os->os_rootbp);
+	ASSERT(!BP_IS_EMBEDDED(bp));
 	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
 	ASSERT0(BP_GET_LEVEL(bp));
 
@@ -935,7 +1065,12 @@
 	 */
 	bp->blk_fill = 0;
 	for (int i = 0; i < dnp->dn_nblkptr; i++)
-		bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+		bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
+	if (os->os_dsl_dataset != NULL)
+		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
+	*os->os_rootbp = *bp;
+	if (os->os_dsl_dataset != NULL)
+		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
 }
 
 /* ARGSUSED */
@@ -955,6 +1090,7 @@
 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
 		dsl_dataset_block_born(ds, bp, tx);
 	}
+	kmem_free(bp, sizeof (*bp));
 }
 
 /* called from dsl */
@@ -962,12 +1098,14 @@
 dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 {
 	int txgoff;
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
 	list_t *list;
 	list_t *newlist = NULL;
 	dbuf_dirty_record_t *dr;
+	blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
+	*blkptr_copy = *os->os_rootbp;
 
 	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
 
@@ -995,10 +1133,9 @@
 	dmu_write_policy(os, NULL, 0, 0, &zp);
 
 	zio = arc_write(pio, os->os_spa, tx->tx_txg,
-	    os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
-	    DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready,
-	    dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
-	    ZIO_FLAG_MUSTSUCCEED, &zb);
+	    blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
+	    &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
+	    os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 
 	/*
 	 * Sync special dnodes - the parent IO for the sync is the root block
@@ -1069,18 +1206,83 @@
 	    DMU_USERUSED_DNODE(os) != NULL);
 }
 
+typedef struct userquota_node {
+	uint64_t uqn_id;
+	int64_t uqn_delta;
+	avl_node_t uqn_node;
+} userquota_node_t;
+
+typedef struct userquota_cache {
+	avl_tree_t uqc_user_deltas;
+	avl_tree_t uqc_group_deltas;
+} userquota_cache_t;
+
+static int
+userquota_compare(const void *l, const void *r)
+{
+	const userquota_node_t *luqn = l;
+	const userquota_node_t *ruqn = r;
+
+	if (luqn->uqn_id < ruqn->uqn_id)
+		return (-1);
+	if (luqn->uqn_id > ruqn->uqn_id)
+		return (1);
+	return (0);
+}
+
 static void
-do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
-    uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
+do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
 {
+	void *cookie;
+	userquota_node_t *uqn;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	cookie = NULL;
+	while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
+	    &cookie)) != NULL) {
+		VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT,
+		    uqn->uqn_id, uqn->uqn_delta, tx));
+		kmem_free(uqn, sizeof (*uqn));
+	}
+	avl_destroy(&cache->uqc_user_deltas);
+
+	cookie = NULL;
+	while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
+	    &cookie)) != NULL) {
+		VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT,
+		    uqn->uqn_id, uqn->uqn_delta, tx));
+		kmem_free(uqn, sizeof (*uqn));
+	}
+	avl_destroy(&cache->uqc_group_deltas);
+}
+
+static void
+userquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta)
+{
+	userquota_node_t search = { .uqn_id = id };
+	avl_index_t idx;
+
+	userquota_node_t *uqn = avl_find(avl, &search, &idx);
+	if (uqn == NULL) {
+		uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
+		uqn->uqn_id = id;
+		avl_insert(avl, uqn, idx);
+	}
+	uqn->uqn_delta += delta;
+}
+
+static void
+do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags,
+    uint64_t user, uint64_t group, boolean_t subtract)
+{
 	if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
 		int64_t delta = DNODE_SIZE + used;
 		if (subtract)
 			delta = -delta;
-		VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
-		    user, delta, tx));
-		VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
-		    group, delta, tx));
+
+		userquota_update_cache(&cache->uqc_user_deltas, user, delta);
+		userquota_update_cache(&cache->uqc_group_deltas, group, delta);
 	}
 }
 
@@ -1089,9 +1291,15 @@
 {
 	dnode_t *dn;
 	list_t *list = &os->os_synced_dnodes;
+	userquota_cache_t cache = { 0 };
 
 	ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
 
+	avl_create(&cache.uqc_user_deltas, userquota_compare,
+	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+	avl_create(&cache.uqc_group_deltas, userquota_compare,
+	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+
 	while (dn = list_head(list)) {
 		int flags;
 		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
@@ -1101,32 +1309,26 @@
 
 		/* Allocate the user/groupused objects if necessary. */
 		if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
-			VERIFY(0 == zap_create_claim(os,
+			VERIFY0(zap_create_claim(os,
 			    DMU_USERUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
-			VERIFY(0 == zap_create_claim(os,
+			VERIFY0(zap_create_claim(os,
 			    DMU_GROUPUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 		}
 
-		/*
-		 * We intentionally modify the zap object even if the
-		 * net delta is zero.  Otherwise
-		 * the block of the zap obj could be shared between
-		 * datasets but need to be different between them after
-		 * a bprewrite.
-		 */
-
 		flags = dn->dn_id_flags;
 		ASSERT(flags);
 		if (flags & DN_ID_OLD_EXIST)  {
-			do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
-			    dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
+			do_userquota_update(&cache,
+			    dn->dn_oldused, dn->dn_oldflags,
+			    dn->dn_olduid, dn->dn_oldgid, B_TRUE);
 		}
 		if (flags & DN_ID_NEW_EXIST) {
-			do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
+			do_userquota_update(&cache,
+			    DN_USED_BYTES(dn->dn_phys),
 			    dn->dn_phys->dn_flags,  dn->dn_newuid,
-			    dn->dn_newgid, B_FALSE, tx);
+			    dn->dn_newgid, B_FALSE);
 		}
 
 		mutex_enter(&dn->dn_mtx);
@@ -1147,6 +1349,7 @@
 		list_remove(list, dn);
 		dnode_rele(dn, list);
 	}
+	do_userquota_cacheflush(os, &cache, tx);
 }
 
 /*
@@ -1388,7 +1591,7 @@
 dmu_objset_is_snapshot(objset_t *os)
 {
 	if (os->os_dsl_dataset != NULL)
-		return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
+		return (os->os_dsl_dataset->ds_is_snapshot);
 	else
 		return (B_FALSE);
 }
@@ -1400,12 +1603,12 @@
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	uint64_t ignored;
 
-	if (ds->ds_phys->ds_snapnames_zapobj == 0)
+	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
 		return (SET_ERROR(ENOENT));
 
 	return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST,
-	    real, maxlen, conflict));
+	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
+	    MT_FIRST, real, maxlen, conflict));
 }
 
 int
@@ -1418,12 +1621,12 @@
 
 	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
 
-	if (ds->ds_phys->ds_snapnames_zapobj == 0)
+	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
 		return (SET_ERROR(ENOENT));
 
 	zap_cursor_init_serialized(&cursor,
 	    ds->ds_dir->dd_pool->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, *offp);
+	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
 
 	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
 		zap_cursor_fini(&cursor);
@@ -1457,12 +1660,12 @@
 
 	/* there is no next dir on a snapshot! */
 	if (os->os_dsl_dataset->ds_object !=
-	    dd->dd_phys->dd_head_dataset_obj)
+	    dsl_dir_phys(dd)->dd_head_dataset_obj)
 		return (SET_ERROR(ENOENT));
 
 	zap_cursor_init_serialized(&cursor,
 	    dd->dd_pool->dp_meta_objset,
-	    dd->dd_phys->dd_child_dir_zapobj, *offp);
+	    dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
 
 	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
 		zap_cursor_fini(&cursor);
@@ -1484,41 +1687,52 @@
 	return (0);
 }
 
-/*
- * Find objsets under and including ddobj, call func(ds) on each.
- */
-int
-dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
-    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+typedef struct dmu_objset_find_ctx {
+	taskq_t		*dc_tq;
+	dsl_pool_t	*dc_dp;
+	uint64_t	dc_ddobj;
+	int		(*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
+	void		*dc_arg;
+	int		dc_flags;
+	kmutex_t	*dc_error_lock;
+	int		*dc_error;
+} dmu_objset_find_ctx_t;
+
+static void
+dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
 {
+	dsl_pool_t *dp = dcp->dc_dp;
+	dmu_objset_find_ctx_t *child_dcp;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	zap_cursor_t zc;
 	zap_attribute_t *attr;
 	uint64_t thisobj;
-	int err;
+	int err = 0;
 
-	ASSERT(dsl_pool_config_held(dp));
+	/* don't process if there already was an error */
+	if (*dcp->dc_error != 0)
+		goto out;
 
-	err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
+	err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
 	if (err != 0)
-		return (err);
+		goto out;
 
 	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
 	if (dd->dd_myname[0] == '$') {
 		dsl_dir_rele(dd, FTAG);
-		return (0);
+		goto out;
 	}
 
-	thisobj = dd->dd_phys->dd_head_dataset_obj;
+	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
 	/*
 	 * Iterate over all children.
 	 */
-	if (flags & DS_FIND_CHILDREN) {
+	if (dcp->dc_flags & DS_FIND_CHILDREN) {
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
-		    dd->dd_phys->dd_child_dir_zapobj);
+		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
 		    zap_cursor_retrieve(&zc, attr) == 0;
 		    (void) zap_cursor_advance(&zc)) {
 			ASSERT3U(attr->za_integer_length, ==,
@@ -1525,29 +1739,29 @@
 			    sizeof (uint64_t));
 			ASSERT3U(attr->za_num_integers, ==, 1);
 
-			err = dmu_objset_find_dp(dp, attr->za_first_integer,
-			    func, arg, flags);
-			if (err != 0)
-				break;
+			child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
+			*child_dcp = *dcp;
+			child_dcp->dc_ddobj = attr->za_first_integer;
+			if (dcp->dc_tq != NULL)
+				(void) taskq_dispatch(dcp->dc_tq,
+				    dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
+			else
+				dmu_objset_find_dp_impl(child_dcp);
 		}
 		zap_cursor_fini(&zc);
-
-		if (err != 0) {
-			dsl_dir_rele(dd, FTAG);
-			kmem_free(attr, sizeof (zap_attribute_t));
-			return (err);
-		}
 	}
 
 	/*
 	 * Iterate over all snapshots.
 	 */
-	if (flags & DS_FIND_SNAPSHOTS) {
+	if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
 		dsl_dataset_t *ds;
 		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
 		if (err == 0) {
-			uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+			uint64_t snapobj;
+
+			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 			dsl_dataset_rele(ds, FTAG);
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
@@ -1561,7 +1775,7 @@
 				    attr->za_first_integer, FTAG, &ds);
 				if (err != 0)
 					break;
-				err = func(dp, ds, arg);
+				err = dcp->dc_func(dp, ds, dcp->dc_arg);
 				dsl_dataset_rele(ds, FTAG);
 				if (err != 0)
 					break;
@@ -1574,7 +1788,7 @@
 	kmem_free(attr, sizeof (zap_attribute_t));
 
 	if (err != 0)
-		return (err);
+		goto out;
 
 	/*
 	 * Apply to self.
@@ -1581,13 +1795,122 @@
 	 */
 	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 	if (err != 0)
-		return (err);
-	err = func(dp, ds, arg);
+		goto out;
+	err = dcp->dc_func(dp, ds, dcp->dc_arg);
 	dsl_dataset_rele(ds, FTAG);
-	return (err);
+
+out:
+	if (err != 0) {
+		mutex_enter(dcp->dc_error_lock);
+		/* only keep first error */
+		if (*dcp->dc_error == 0)
+			*dcp->dc_error = err;
+		mutex_exit(dcp->dc_error_lock);
+	}
+
+	kmem_free(dcp, sizeof (*dcp));
 }
 
+static void
+dmu_objset_find_dp_cb(void *arg)
+{
+	dmu_objset_find_ctx_t *dcp = arg;
+	dsl_pool_t *dp = dcp->dc_dp;
+
+	/*
+	 * We need to get a pool_config_lock here, as there are several
+	 * asserts of pool_config_held down the stack. Getting a lock via
+	 * dsl_pool_config_enter is risky, as it might be stalled by a
+	 * pending writer. This would deadlock, as the write lock can
+	 * only be granted when our parent thread gives up the lock.
+	 * The _prio interface gives us priority over a pending writer.
+	 */
+	dsl_pool_config_enter_prio(dp, FTAG);
+
+	dmu_objset_find_dp_impl(dcp);
+
+	dsl_pool_config_exit(dp, FTAG);
+}
+
 /*
+ * Find objsets under and including ddobj, call func(ds) on each.
+ * The order for the enumeration is completely undefined.
+ * func is called with dsl_pool_config held.
+ */
+int
+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
+    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+{
+	int error = 0;
+	taskq_t *tq = NULL;
+	int ntasks;
+	dmu_objset_find_ctx_t *dcp;
+	kmutex_t err_lock;
+
+	mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
+	dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
+	dcp->dc_tq = NULL;
+	dcp->dc_dp = dp;
+	dcp->dc_ddobj = ddobj;
+	dcp->dc_func = func;
+	dcp->dc_arg = arg;
+	dcp->dc_flags = flags;
+	dcp->dc_error_lock = &err_lock;
+	dcp->dc_error = &error;
+
+	if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
+		/*
+		 * In case a write lock is held we can't make use of
+		 * parallelism, as down the stack of the worker threads
+		 * the lock is asserted via dsl_pool_config_held.
+		 * In case of a read lock this is solved by getting a read
+		 * lock in each worker thread, which isn't possible in case
+		 * of a writer lock. So we fall back to the synchronous path
+		 * here.
+		 * In the future it might be possible to get some magic into
+		 * dsl_pool_config_held in a way that it returns true for
+		 * the worker threads so that a single lock held from this
+		 * thread suffices. For now, stay single threaded.
+		 */
+		dmu_objset_find_dp_impl(dcp);
+		mutex_destroy(&err_lock);
+
+		return (error);
+	}
+
+	ntasks = dmu_find_threads;
+	if (ntasks == 0)
+		ntasks = vdev_count_leaves(dp->dp_spa) * 4;
+	tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
+	    INT_MAX, 0);
+	if (tq == NULL) {
+		kmem_free(dcp, sizeof (*dcp));
+		mutex_destroy(&err_lock);
+
+		return (SET_ERROR(ENOMEM));
+	}
+	dcp->dc_tq = tq;
+
+	/* dcp will be freed by task */
+	(void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
+
+	/*
+	 * PORTING: this code relies on the property of taskq_wait to wait
+	 * until no more tasks are queued and no more tasks are active. As
+	 * we always queue new tasks from within other tasks, taskq_wait
+	 * reliably waits for the full recursion to finish, even though we
+	 * enqueue new tasks after taskq_wait has been called.
+	 * On platforms other than illumos, taskq_wait may not have this
+	 * property.
+	 */
+	taskq_wait(tq);
+	taskq_destroy(tq);
+	mutex_destroy(&err_lock);
+
+	return (error);
+}
+
+/*
  * Find all objsets under name, and for each, call 'func(child_name, arg)'.
  * The dp_config_rwlock must not be held when this is called, and it
  * will not be held when the callback is called.
@@ -1622,7 +1945,7 @@
 		return (0);
 	}
 
-	thisobj = dd->dd_phys->dd_head_dataset_obj;
+	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
 	/*
@@ -1630,7 +1953,7 @@
 	 */
 	if (flags & DS_FIND_CHILDREN) {
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
-		    dd->dd_phys->dd_child_dir_zapobj);
+		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
 		    zap_cursor_retrieve(&zc, attr) == 0;
 		    (void) zap_cursor_advance(&zc)) {
 			ASSERT3U(attr->za_integer_length, ==,
@@ -1663,7 +1986,9 @@
 		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
 		if (err == 0) {
-			uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+			uint64_t snapobj;
+
+			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 			dsl_dataset_rele(ds, FTAG);
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
@@ -1731,7 +2056,7 @@
 
 /*
  * Determine name of filesystem, given name of snapshot.
- * buf must be at least MAXNAMELEN bytes
+ * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
  */
 int
 dmu_fsname(const char *snapname, char *buf)
@@ -1739,7 +2064,7 @@
 	char *atp = strchr(snapname, '@');
 	if (atp == NULL)
 		return (SET_ERROR(EINVAL));
-	if (atp - snapname >= MAXNAMELEN)
+	if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
 		return (SET_ERROR(ENAMETOOLONG));
 	(void) strlcpy(buf, snapname, atp - snapname + 1);
 	return (0);

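On the dmu_objset_find_dp() rewrite above: each task enqueues further tasks
for child dsl_dirs on the same taskq, so the PORTING comment is the crux;
correctness depends on taskq_wait() draining tasks that were enqueued after
the wait began. A stripped-down sketch of that fan-out idiom (node_t and the
visit()/first_child()/next_sibling() helpers are hypothetical, standing in
for the dsl_dir iteration in the real code):

    struct walk_arg {
    	taskq_t	*wa_tq;
    	node_t	*wa_node;	/* hypothetical tree node type */
    };

    static void
    walk_cb(void *arg)
    {
    	struct walk_arg *wa = arg;

    	visit(wa->wa_node);	/* hypothetical per-node work */
    	for (node_t *c = first_child(wa->wa_node); c != NULL;
    	    c = next_sibling(c)) {
    		struct walk_arg *ca = kmem_alloc(sizeof (*ca), KM_SLEEP);
    		ca->wa_tq = wa->wa_tq;
    		ca->wa_node = c;
    		/* Children are dispatched from inside a task ... */
    		(void) taskq_dispatch(wa->wa_tq, walk_cb, ca, TQ_SLEEP);
    	}
    	kmem_free(wa, sizeof (*wa));
    }
    /* ... so the caller relies on taskq_wait(tq) draining the recursion. */
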
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,9 +22,12 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright (c) 2012, Martin Matuska <mm at FreeBSD.org>. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ * Copyright 2016 RackTop Systems.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/dmu.h>
@@ -50,22 +53,76 @@
 #include <sys/zfs_onexit.h>
 #include <sys/dmu_send.h>
 #include <sys/dsl_destroy.h>
+#include <sys/blkptr.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/zfeature.h>
+#include <sys/bqueue.h>
 
+#ifdef __FreeBSD__
+#undef dump_write
+#define dump_write dmu_dump_write
+#endif
+
 /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
 int zfs_send_corrupt_data = B_FALSE;
+int zfs_send_queue_length = 16 * 1024 * 1024;
+int zfs_recv_queue_length = 16 * 1024 * 1024;
+/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
+int zfs_send_set_freerecords_bit = B_TRUE;
 
+#ifdef _KERNEL
+TUNABLE_INT("vfs.zfs.send_set_freerecords_bit", &zfs_send_set_freerecords_bit);
+#endif
+
 static char *dmu_recv_tag = "dmu_recv_tag";
-static const char *recv_clone_name = "%recv";
+const char *recv_clone_name = "%recv";
 
+#define	BP_SPAN(datablkszsec, indblkshift, level) \
+	(((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
+	(level) * (indblkshift - SPA_BLKPTRSHIFT)))
+
+static void byteswap_record(dmu_replay_record_t *drr);
+
+struct send_thread_arg {
+	bqueue_t	q;
+	dsl_dataset_t	*ds;		/* Dataset to traverse */
+	uint64_t	fromtxg;	/* Traverse from this txg */
+	int		flags;		/* flags to pass to traverse_dataset */
+	int		error_code;
+	boolean_t	cancel;
+	zbookmark_phys_t resume;
+};
+
+struct send_block_record {
+	boolean_t		eos_marker; /* Marks the end of the stream */
+	blkptr_t		bp;
+	zbookmark_phys_t	zb;
+	uint8_t			indblkshift;
+	uint16_t		datablkszsec;
+	bqueue_node_t		ln;
+};
+
 static int
 dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
 {
-	dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
+	dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os);
 	struct uio auio;
 	struct iovec aiov;
+
+	/*
+	 * The code does not rely on this (len being a multiple of 8).  We keep
+	 * this assertion because of the corresponding assertion in
+	 * receive_read().  Keeping this assertion ensures that we do not
+	 * inadvertently break backwards compatibility (causing the assertion
+	 * in receive_read() to trigger on old software).
+	 *
+	 * Removing the assertions could be rolled into a new feature that uses
+	 * data that isn't 8-byte aligned; if the assertions were removed, a
+	 * feature flag would have to be added.
+	 */
+
 	ASSERT0(len % 8);
 
-	fletcher_4_incremental_native(buf, len, &dsp->dsa_zc);
 	aiov.iov_base = buf;
 	aiov.iov_len = len;
 	auio.uio_iov = &aiov;
@@ -91,12 +148,74 @@
 	return (dsp->dsa_err);
 }
 
+/*
+ * For all record types except BEGIN, fill in the checksum (overlaid in
+ * drr_u.drr_checksum.drr_checksum).  The checksum verifies everything
+ * up to the start of the checksum itself.
+ */
 static int
+dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
+{
+	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+	fletcher_4_incremental_native(dsp->dsa_drr,
+	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+	    &dsp->dsa_zc);
+	if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
+		dsp->dsa_sent_begin = B_TRUE;
+	} else {
+		ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
+		    drr_checksum.drr_checksum));
+		dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
+	}
+	if (dsp->dsa_drr->drr_type == DRR_END) {
+		dsp->dsa_sent_end = B_TRUE;
+	}
+	fletcher_4_incremental_native(&dsp->dsa_drr->
+	    drr_u.drr_checksum.drr_checksum,
+	    sizeof (zio_cksum_t), &dsp->dsa_zc);
+	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
+		return (SET_ERROR(EINTR));
+	if (payload_len != 0) {
+		fletcher_4_incremental_native(payload, payload_len,
+		    &dsp->dsa_zc);
+		if (dump_bytes(dsp, payload, payload_len) != 0)
+			return (SET_ERROR(EINTR));
+	}
+	return (0);
+}
+
+/*
+ * Fill in the drr_free struct, or perform aggregation if the previous record is
+ * also a free record, and the two are adjacent.
+ *
+ * Note that we send free records even for a full send, because we want to be
+ * able to receive a full send as a clone, which requires a list of all the free
+ * and freeobject records that were generated on the source.
+ */
+static int
 dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
     uint64_t length)
 {
 	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
 
+	/*
+	 * When we receive a free record, dbuf_free_range() assumes
+	 * that the receiving system doesn't have any dbufs in the range
+	 * being freed.  This is always true because there is a one-record
+	 * constraint: we only send one WRITE record for any given
+	 * object,offset.  We know that the one-record constraint is
+	 * true because we always send data in increasing order by
+	 * object,offset.
+	 *
+	 * If the increasing-order constraint ever changes, we should find
+	 * another way to assert that the one-record constraint is still
+	 * satisfied.
+	 */
+	ASSERT(object > dsp->dsa_last_data_object ||
+	    (object == dsp->dsa_last_data_object &&
+	    offset > dsp->dsa_last_data_offset));
+
 	if (length != -1ULL && offset + length < offset)
 		length = -1ULL;
 
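On dump_record() above: for every record except BEGIN, the running stream
checksum is overlaid into drr_u.drr_checksum.drr_checksum; the stored value
covers everything up to the field itself, and the field is then folded into
the stream checksum as well. A receiver-side sketch of the matching check,
assuming a native-endian stream, a header-only record, and the same
fletcher_4_incremental_native() interface (example_check_record is an
illustrative name; byteswapped streams would use the _byteswap variant):

    static int
    example_check_record(dmu_replay_record_t *drr, zio_cksum_t *zc)
    {
    	uint64_t off = offsetof(dmu_replay_record_t,
    	    drr_u.drr_checksum.drr_checksum);
    	zio_cksum_t expect;

    	/* Fold in everything up to the checksum field itself. */
    	fletcher_4_incremental_native(drr, off, zc);
    	expect = *zc;

    	if (drr->drr_type != DRR_BEGIN &&
    	    !ZIO_CHECKSUM_EQUAL(drr->drr_u.drr_checksum.drr_checksum,
    	    expect))
    		return (SET_ERROR(ECKSUM));

    	/* Then fold in the checksum field, matching the sender. */
    	fletcher_4_incremental_native(
    	    &drr->drr_u.drr_checksum.drr_checksum,
    	    sizeof (zio_cksum_t), zc);
    	return (0);
    }
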
@@ -109,8 +228,7 @@
 	 */
 	if (dsp->dsa_pending_op != PENDING_NONE &&
 	    dsp->dsa_pending_op != PENDING_FREE) {
-		if (dump_bytes(dsp, dsp->dsa_drr,
-		    sizeof (dmu_replay_record_t)) != 0)
+		if (dump_record(dsp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
@@ -133,8 +251,7 @@
 			return (0);
 		} else {
 			/* not a continuation.  Push out pending record */
-			if (dump_bytes(dsp, dsp->dsa_drr,
-			    sizeof (dmu_replay_record_t)) != 0)
+			if (dump_record(dsp, NULL, 0) != 0)
 				return (SET_ERROR(EINTR));
 			dsp->dsa_pending_op = PENDING_NONE;
 		}
@@ -147,8 +264,7 @@
 	drrf->drr_length = length;
 	drrf->drr_toguid = dsp->dsa_toguid;
 	if (length == -1ULL) {
-		if (dump_bytes(dsp, dsp->dsa_drr,
-		    sizeof (dmu_replay_record_t)) != 0)
+		if (dump_record(dsp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 	} else {
 		dsp->dsa_pending_op = PENDING_FREE;
@@ -158,11 +274,20 @@
 }
 
 static int
-dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
+dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
     uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
 {
 	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
 
+	/*
+	 * We send data in increasing object, offset order.
+	 * See comment in dump_free() for details.
+	 */
+	ASSERT(object > dsp->dsa_last_data_object ||
+	    (object == dsp->dsa_last_data_object &&
+	    offset > dsp->dsa_last_data_offset));
+	dsp->dsa_last_data_object = object;
+	dsp->dsa_last_data_offset = offset + blksz - 1;
 
 	/*
 	 * If there is any kind of pending aggregation (currently either
@@ -171,12 +296,11 @@
 	 * of different types.
 	 */
 	if (dsp->dsa_pending_op != PENDING_NONE) {
-		if (dump_bytes(dsp, dsp->dsa_drr,
-		    sizeof (dmu_replay_record_t)) != 0)
+		if (dump_record(dsp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
-	/* write a DATA record */
+	/* write a WRITE record */
 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 	dsp->dsa_drr->drr_type = DRR_WRITE;
 	drrw->drr_object = object;
@@ -184,29 +308,71 @@
 	drrw->drr_offset = offset;
 	drrw->drr_length = blksz;
 	drrw->drr_toguid = dsp->dsa_toguid;
-	drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
-	if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
-		drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
-	DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
-	DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
-	DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
-	drrw->drr_key.ddk_cksum = bp->blk_cksum;
+	if (bp == NULL || BP_IS_EMBEDDED(bp)) {
+		/*
+		 * There's no pre-computed checksum for partial-block
+		 * writes or embedded BPs, so (like
+		 * fletcher4-checksummed blocks) userland will have to
+		 * compute a dedup-capable checksum itself.
+		 */
+		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
+	} else {
+		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
+		if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
+		    ZCHECKSUM_FLAG_DEDUP)
+			drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
+		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
+		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
+		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
+		drrw->drr_key.ddk_cksum = bp->blk_cksum;
+	}
 
-	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
+	if (dump_record(dsp, data, blksz) != 0)
 		return (SET_ERROR(EINTR));
-	if (dump_bytes(dsp, data, blksz) != 0)
-		return (SET_ERROR(EINTR));
 	return (0);
 }
 
 static int
+dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
+    int blksz, const blkptr_t *bp)
+{
+	char buf[BPE_PAYLOAD_SIZE];
+	struct drr_write_embedded *drrw =
+	    &(dsp->dsa_drr->drr_u.drr_write_embedded);
+
+	if (dsp->dsa_pending_op != PENDING_NONE) {
+		if (dump_record(dsp, NULL, 0) != 0)
+			return (EINTR);
+		dsp->dsa_pending_op = PENDING_NONE;
+	}
+
+	ASSERT(BP_IS_EMBEDDED(bp));
+
+	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
+	drrw->drr_object = object;
+	drrw->drr_offset = offset;
+	drrw->drr_length = blksz;
+	drrw->drr_toguid = dsp->dsa_toguid;
+	drrw->drr_compression = BP_GET_COMPRESS(bp);
+	drrw->drr_etype = BPE_GET_ETYPE(bp);
+	drrw->drr_lsize = BPE_GET_LSIZE(bp);
+	drrw->drr_psize = BPE_GET_PSIZE(bp);
+
+	decode_embedded_bp_compressed(bp, buf);
+
+	if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
+		return (EINTR);
+	return (0);
+}
+
+static int
 dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
 {
 	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
 
 	if (dsp->dsa_pending_op != PENDING_NONE) {
-		if (dump_bytes(dsp, dsp->dsa_drr,
-		    sizeof (dmu_replay_record_t)) != 0)
+		if (dump_record(dsp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
@@ -218,10 +384,8 @@
 	drrs->drr_length = blksz;
 	drrs->drr_toguid = dsp->dsa_toguid;
 
-	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)))
+	if (dump_record(dsp, data, blksz) != 0)
 		return (SET_ERROR(EINTR));
-	if (dump_bytes(dsp, data, blksz))
-		return (SET_ERROR(EINTR));
 	return (0);
 }
 
@@ -239,8 +403,7 @@
 	 */
 	if (dsp->dsa_pending_op != PENDING_NONE &&
 	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
-		if (dump_bytes(dsp, dsp->dsa_drr,
-		    sizeof (dmu_replay_record_t)) != 0)
+		if (dump_record(dsp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
@@ -254,8 +417,7 @@
 			return (0);
 		} else {
 			/* can't be aggregated.  Push out pending record */
-			if (dump_bytes(dsp, dsp->dsa_drr,
-			    sizeof (dmu_replay_record_t)) != 0)
+			if (dump_record(dsp, NULL, 0) != 0)
 				return (SET_ERROR(EINTR));
 			dsp->dsa_pending_op = PENDING_NONE;
 		}
@@ -278,12 +440,24 @@
 {
 	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
 
+	if (object < dsp->dsa_resume_object) {
+		/*
+		 * Note: when resuming, we will visit all the dnodes in
+		 * the block of dnodes that we are resuming from.  In
+		 * this case it's unnecessary to send the dnodes prior to
+		 * the one we are resuming from.  We should be at most one
+		 * block's worth of dnodes behind the resume point.
+		 */
+		ASSERT3U(dsp->dsa_resume_object - object, <,
+		    1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
+		return (0);
+	}
+
 	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
 		return (dump_freeobjects(dsp, object, 1));
 
 	if (dsp->dsa_pending_op != PENDING_NONE) {
-		if (dump_bytes(dsp, dsp->dsa_drr,
-		    sizeof (dmu_replay_record_t)) != 0)
+		if (dump_record(dsp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dsp->dsa_pending_op = PENDING_NONE;
 	}
@@ -300,15 +474,18 @@
 	drro->drr_compress = dnp->dn_compress;
 	drro->drr_toguid = dsp->dsa_toguid;
 
-	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
-		return (SET_ERROR(EINTR));
+	if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
 
-	if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
+	if (dump_record(dsp, DN_BONUS(dnp),
+	    P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) {
 		return (SET_ERROR(EINTR));
+	}
 
-	/* free anything past the end of the file */
+	/* Free anything past the end of the file. */
 	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
-	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
+	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
 		return (SET_ERROR(EINTR));
 	if (dsp->dsa_err != 0)
 		return (SET_ERROR(EINTR));
@@ -315,57 +492,157 @@
 	return (0);
 }
 
-#define	BP_SPAN(dnp, level) \
-	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
-	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+static boolean_t
+backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
+{
+	if (!BP_IS_EMBEDDED(bp))
+		return (B_FALSE);
 
-/* ARGSUSED */
+	/*
+	 * Compression function must be legacy, or explicitly enabled.
+	 */
+	if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
+	    !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)))
+		return (B_FALSE);
+
+	/*
+	 * Embed type must be explicitly enabled.
+	 */
+	switch (BPE_GET_ETYPE(bp)) {
+	case BP_EMBEDDED_TYPE_DATA:
+		if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
+			return (B_TRUE);
+		break;
+	default:
+		return (B_FALSE);
+	}
+	return (B_FALSE);
+}
+
+/*
+ * This is the callback function for traverse_dataset; it runs in the
+ * traversal thread and queues block records for dmu_send_impl to dump.
+ */
+/* ARGSUSED */
 static int
-backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
 {
-	dmu_sendarg_t *dsp = arg;
-	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
+	struct send_thread_arg *sta = arg;
+	struct send_block_record *record;
+	uint64_t record_size;
 	int err = 0;
 
-	if (issig(JUSTLOOKING) && issig(FORREAL))
+	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+	    zb->zb_object >= sta->resume.zb_object);
+
+	if (sta->cancel)
 		return (SET_ERROR(EINTR));
 
+	if (bp == NULL) {
+		ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
+		return (0);
+	} else if (zb->zb_level < 0) {
+		return (0);
+	}
+
+	record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
+	record->eos_marker = B_FALSE;
+	record->bp = *bp;
+	record->zb = *zb;
+	record->indblkshift = dnp->dn_indblkshift;
+	record->datablkszsec = dnp->dn_datablkszsec;
+	record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+	bqueue_enqueue(&sta->q, record, record_size);
+
+	return (err);
+}
+
+/*
+ * This function kicks off traverse_dataset.  It also handles setting the
+ * error code of the thread in case something goes wrong, and pushes the End
+ * of Stream record when the traverse_dataset call has finished.  If there is
+ * no dataset to traverse, the thread immediately pushes the End of Stream
+ * marker.
+ */
+static void
+send_traverse_thread(void *arg)
+{
+	struct send_thread_arg *st_arg = arg;
+	int err;
+	struct send_block_record *data;
+
+	if (st_arg->ds != NULL) {
+		err = traverse_dataset_resume(st_arg->ds,
+		    st_arg->fromtxg, &st_arg->resume,
+		    st_arg->flags, send_cb, st_arg);
+
+		if (err != EINTR)
+			st_arg->error_code = err;
+	}
+	data = kmem_zalloc(sizeof (*data), KM_SLEEP);
+	data->eos_marker = B_TRUE;
+	bqueue_enqueue(&st_arg->q, data, 1);
+	thread_exit();
+}
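
The traversal thread above is the producer half of a bounded-queue pipeline;
dmu_send_impl() consumes records on the other end until it sees the
eos_marker.  A minimal sketch of that consumer shape, using only the bqueue
calls visible in this change (the real loop, shown further down, calls
do_dump() for each record):

    static void
    consume_records(struct send_thread_arg *sta)
    {
        struct send_block_record *rec = bqueue_dequeue(&sta->q);

        while (!rec->eos_marker) {
            /* the real consumer calls do_dump(dsp, rec) here */
            struct send_block_record *next = bqueue_dequeue(&sta->q);
            kmem_free(rec, sizeof (*rec));
            rec = next;
        }
        kmem_free(rec, sizeof (*rec));  /* also free the EOS record */
    }
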
+
+/*
+ * This function actually handles figuring out what kind of record needs to be
+ * dumped, reading the data (which has hopefully been prefetched), and calling
+ * the appropriate helper function.
+ */
+static int
+do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
+{
+	dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
+	const blkptr_t *bp = &data->bp;
+	const zbookmark_phys_t *zb = &data->zb;
+	uint8_t indblkshift = data->indblkshift;
+	uint16_t dblkszsec = data->datablkszsec;
+	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
+	int err = 0;
+
+	ASSERT3U(zb->zb_level, >=, 0);
+
+	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+	    zb->zb_object >= dsa->dsa_resume_object);
+
 	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
 	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
 		return (0);
-	} else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
-		uint64_t span = BP_SPAN(dnp, zb->zb_level);
+	} else if (BP_IS_HOLE(bp) &&
+	    zb->zb_object == DMU_META_DNODE_OBJECT) {
+		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
 		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
-		err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
-	} else if (bp == NULL) {
-		uint64_t span = BP_SPAN(dnp, zb->zb_level);
-		err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
+		err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
+	} else if (BP_IS_HOLE(bp)) {
+		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
+		uint64_t offset = zb->zb_blkid * span;
+		err = dump_free(dsa, zb->zb_object, offset, span);
 	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
 		return (0);
 	} else if (type == DMU_OT_DNODE) {
-		dnode_phys_t *blk;
-		int i;
 		int blksz = BP_GET_LSIZE(bp);
-		uint32_t aflags = ARC_WAIT;
+		arc_flags_t aflags = ARC_FLAG_WAIT;
 		arc_buf_t *abuf;
 
+		ASSERT0(zb->zb_level);
+
 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
 		    &aflags, zb) != 0)
 			return (SET_ERROR(EIO));
 
-		blk = abuf->b_data;
-		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
-			uint64_t dnobj = (zb->zb_blkid <<
-			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
-			err = dump_dnode(dsp, dnobj, blk+i);
+		dnode_phys_t *blk = abuf->b_data;
+		uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT);
+		for (int i = 0; i < blksz >> DNODE_SHIFT; i++) {
+			err = dump_dnode(dsa, dnobj + i, blk + i);
 			if (err != 0)
 				break;
 		}
-		(void) arc_buf_remove_ref(abuf, &abuf);
+		arc_buf_destroy(abuf, &abuf);
 	} else if (type == DMU_OT_SA) {
-		uint32_t aflags = ARC_WAIT;
+		arc_flags_t aflags = ARC_FLAG_WAIT;
 		arc_buf_t *abuf;
 		int blksz = BP_GET_LSIZE(bp);
 
@@ -374,33 +651,62 @@
 		    &aflags, zb) != 0)
 			return (SET_ERROR(EIO));
 
-		err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
-		(void) arc_buf_remove_ref(abuf, &abuf);
-	} else { /* it's a level-0 block of a regular object */
-		uint32_t aflags = ARC_WAIT;
+		err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
+		arc_buf_destroy(abuf, &abuf);
+	} else if (backup_do_embed(dsa, bp)) {
+		/* it's an embedded level-0 block of a regular object */
+		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
+		ASSERT0(zb->zb_level);
+		err = dump_write_embedded(dsa, zb->zb_object,
+		    zb->zb_blkid * blksz, blksz, bp);
+	} else {
+		/* it's a level-0 block of a regular object */
+		arc_flags_t aflags = ARC_FLAG_WAIT;
 		arc_buf_t *abuf;
-		int blksz = BP_GET_LSIZE(bp);
+		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
+		uint64_t offset;
 
+		ASSERT0(zb->zb_level);
+		ASSERT(zb->zb_object > dsa->dsa_resume_object ||
+		    (zb->zb_object == dsa->dsa_resume_object &&
+		    zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
+
 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
 		    &aflags, zb) != 0) {
 			if (zfs_send_corrupt_data) {
 				/* Send a block filled with 0x"zfs badd bloc" */
-				abuf = arc_buf_alloc(spa, blksz, &abuf,
+				abuf = arc_alloc_buf(spa, blksz, &abuf,
 				    ARC_BUFC_DATA);
 				uint64_t *ptr;
 				for (ptr = abuf->b_data;
 				    (char *)ptr < (char *)abuf->b_data + blksz;
 				    ptr++)
-					*ptr = 0x2f5baddb10c;
+					*ptr = 0x2f5baddb10cULL;
 			} else {
 				return (SET_ERROR(EIO));
 			}
 		}
 
-		err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
-		    blksz, bp, abuf->b_data);
-		(void) arc_buf_remove_ref(abuf, &abuf);
+		offset = zb->zb_blkid * blksz;
+
+		if (!(dsa->dsa_featureflags &
+		    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+		    blksz > SPA_OLD_MAXBLOCKSIZE) {
+			char *buf = abuf->b_data;
+			while (blksz > 0 && err == 0) {
+				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+				err = dump_write(dsa, type, zb->zb_object,
+				    offset, n, NULL, buf);
+				offset += n;
+				buf += n;
+				blksz -= n;
+			}
+		} else {
+			err = dump_write(dsa, type, zb->zb_object,
+			    offset, blksz, bp, abuf->b_data);
+		}
+		arc_buf_destroy(abuf, &abuf);
 	}
 
 	ASSERT(err == 0 || err == EINTR);
@@ -408,14 +714,30 @@
 }
 
 /*
- * Releases dp, ds, and fromds, using the specified tag.
+ * Pop the new data off the queue, and free the old data.
  */
+static struct send_block_record *
+get_next_record(bqueue_t *bq, struct send_block_record *data)
+{
+	struct send_block_record *tmp = bqueue_dequeue(bq);
+	kmem_free(data, sizeof (*data));
+	return (tmp);
+}
+
+/*
+ * Actually do the bulk of the work in a zfs send.
+ *
+ * Note: Releases dp using the specified tag.
+ */
 static int
-dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
+dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
+    zfs_bookmark_phys_t *ancestor_zb,
+    boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd,
+    uint64_t resumeobj, uint64_t resumeoff,
 #ifdef illumos
-    dsl_dataset_t *fromds, int outfd, vnode_t *vp, offset_t *off)
+    vnode_t *vp, offset_t *off)
 #else
-    dsl_dataset_t *fromds, int outfd, struct file *fp, offset_t *off)
+    struct file *fp, offset_t *off)
 #endif
 {
 	objset_t *os;
@@ -423,19 +745,11 @@
 	dmu_sendarg_t *dsp;
 	int err;
 	uint64_t fromtxg = 0;
+	uint64_t featureflags = 0;
+	struct send_thread_arg to_arg = { 0 };
 
-	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds)) {
-		dsl_dataset_rele(fromds, tag);
-		dsl_dataset_rele(ds, tag);
-		dsl_pool_rele(dp, tag);
-		return (SET_ERROR(EXDEV));
-	}
-
-	err = dmu_objset_from_ds(ds, &os);
+	err = dmu_objset_from_ds(to_ds, &os);
 	if (err != 0) {
-		if (fromds != NULL)
-			dsl_dataset_rele(fromds, tag);
-		dsl_dataset_rele(ds, tag);
 		dsl_pool_rele(dp, tag);
 		return (err);
 	}
@@ -451,38 +765,52 @@
 		uint64_t version;
 		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
 			kmem_free(drr, sizeof (dmu_replay_record_t));
-			if (fromds != NULL)
-				dsl_dataset_rele(fromds, tag);
-			dsl_dataset_rele(ds, tag);
 			dsl_pool_rele(dp, tag);
 			return (SET_ERROR(EINVAL));
 		}
 		if (version >= ZPL_VERSION_SA) {
-			DMU_SET_FEATUREFLAGS(
-			    drr->drr_u.drr_begin.drr_versioninfo,
-			    DMU_BACKUP_FEATURE_SA_SPILL);
+			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
 		}
 	}
 #endif
 
+	if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
+		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
+	if (embedok &&
+	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
+		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
+		if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+			featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
+	}
+
+	if (resumeobj != 0 || resumeoff != 0) {
+		featureflags |= DMU_BACKUP_FEATURE_RESUMING;
+	}
+
+	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
+	    featureflags);
+
 	drr->drr_u.drr_begin.drr_creation_time =
-	    ds->ds_phys->ds_creation_time;
+	    dsl_dataset_phys(to_ds)->ds_creation_time;
 	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
-	if (fromds != NULL && ds->ds_dir != fromds->ds_dir)
+	if (is_clone)
 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
-	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
-	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
+	if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
+	if (zfs_send_set_freerecords_bit)
+		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
 
-	if (fromds != NULL)
-		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
-	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
-
-	if (fromds != NULL) {
-		fromtxg = fromds->ds_phys->ds_creation_txg;
-		dsl_dataset_rele(fromds, tag);
-		fromds = NULL;
+	if (ancestor_zb != NULL) {
+		drr->drr_u.drr_begin.drr_fromguid =
+		    ancestor_zb->zbm_guid;
+		fromtxg = ancestor_zb->zbm_creation_txg;
 	}
+	dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
+	if (!to_ds->ds_is_snapshot) {
+		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
+		    sizeof (drr->drr_u.drr_begin.drr_toname));
+	}
 
 	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);
 
@@ -493,27 +821,82 @@
 	dsp->dsa_fp = fp;
 	dsp->dsa_os = os;
 	dsp->dsa_off = off;
-	dsp->dsa_toguid = ds->ds_phys->ds_guid;
-	ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
+	dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
 	dsp->dsa_pending_op = PENDING_NONE;
+	dsp->dsa_featureflags = featureflags;
+	dsp->dsa_resume_object = resumeobj;
+	dsp->dsa_resume_offset = resumeoff;
 
-	mutex_enter(&ds->ds_sendstream_lock);
-	list_insert_head(&ds->ds_sendstreams, dsp);
-	mutex_exit(&ds->ds_sendstream_lock);
+	mutex_enter(&to_ds->ds_sendstream_lock);
+	list_insert_head(&to_ds->ds_sendstreams, dsp);
+	mutex_exit(&to_ds->ds_sendstream_lock);
 
-	dsl_dataset_long_hold(ds, FTAG);
+	dsl_dataset_long_hold(to_ds, FTAG);
 	dsl_pool_rele(dp, tag);
 
-	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
+	void *payload = NULL;
+	size_t payload_len = 0;
+	if (resumeobj != 0 || resumeoff != 0) {
+		dmu_object_info_t to_doi;
+		err = dmu_object_info(os, resumeobj, &to_doi);
+		if (err != 0)
+			goto out;
+		SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0,
+		    resumeoff / to_doi.doi_data_block_size);
+
+		nvlist_t *nvl = fnvlist_alloc();
+		fnvlist_add_uint64(nvl, "resume_object", resumeobj);
+		fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
+		payload = fnvlist_pack(nvl, &payload_len);
+		drr->drr_payloadlen = payload_len;
+		fnvlist_free(nvl);
+	}
+
+	err = dump_record(dsp, payload, payload_len);
+	fnvlist_pack_free(payload, payload_len);
+	if (err != 0) {
 		err = dsp->dsa_err;
 		goto out;
 	}
 
-	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
-	    backup_cb, dsp);
+	err = bqueue_init(&to_arg.q, zfs_send_queue_length,
+	    offsetof(struct send_block_record, ln));
+	to_arg.error_code = 0;
+	to_arg.cancel = B_FALSE;
+	to_arg.ds = to_ds;
+	to_arg.fromtxg = fromtxg;
+	to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
+	(void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, &p0,
+	    TS_RUN, minclsyspri);
 
+	struct send_block_record *to_data;
+	to_data = bqueue_dequeue(&to_arg.q);
+
+	while (!to_data->eos_marker && err == 0) {
+		err = do_dump(dsp, to_data);
+		to_data = get_next_record(&to_arg.q, to_data);
+		if (issig(JUSTLOOKING) && issig(FORREAL))
+			err = EINTR;
+	}
+
+	if (err != 0) {
+		to_arg.cancel = B_TRUE;
+		while (!to_data->eos_marker) {
+			to_data = get_next_record(&to_arg.q, to_data);
+		}
+	}
+	kmem_free(to_data, sizeof (*to_data));
+
+	bqueue_destroy(&to_arg.q);
+
+	if (err == 0 && to_arg.error_code != 0)
+		err = to_arg.error_code;
+
+	if (err != 0)
+		goto out;
+
 	if (dsp->dsa_pending_op != PENDING_NONE)
-		if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
+		if (dump_record(dsp, NULL, 0) != 0)
 			err = SET_ERROR(EINTR);
 
 	if (err != 0) {
@@ -527,21 +910,20 @@
 	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
 	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
 
-	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
+	if (dump_record(dsp, NULL, 0) != 0)
 		err = dsp->dsa_err;
-		goto out;
-	}
 
 out:
-	mutex_enter(&ds->ds_sendstream_lock);
-	list_remove(&ds->ds_sendstreams, dsp);
-	mutex_exit(&ds->ds_sendstream_lock);
+	mutex_enter(&to_ds->ds_sendstream_lock);
+	list_remove(&to_ds->ds_sendstreams, dsp);
+	mutex_exit(&to_ds->ds_sendstream_lock);
 
+	VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end));
+
 	kmem_free(drr, sizeof (dmu_replay_record_t));
 	kmem_free(dsp, sizeof (dmu_sendarg_t));
 
-	dsl_dataset_long_rele(ds, FTAG);
-	dsl_dataset_rele(ds, tag);
+	dsl_dataset_long_rele(to_ds, FTAG);
 
 	return (err);
 }
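
The resumable BEGIN record built above carries a packed nvlist payload with
just two uint64 entries.  For illustration, the receiving side can recover
them with the standard nvpair API; this helper is a hedged sketch (the name
and error handling are illustrative, not the actual receive code):

    /* Sketch: unpack the resume info from a BEGIN payload. */
    static int
    parse_resume_payload(void *payload, size_t len,
        uint64_t *resumeobj, uint64_t *resumeoff)
    {
        nvlist_t *nvl;

        if (nvlist_unpack(payload, len, &nvl, KM_SLEEP) != 0)
            return (SET_ERROR(EINVAL));
        *resumeobj = fnvlist_lookup_uint64(nvl, "resume_object");
        *resumeoff = fnvlist_lookup_uint64(nvl, "resume_offset");
        nvlist_free(nvl);
        return (0);
    }
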
@@ -548,6 +930,7 @@
 
 int
 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
 #ifdef illumos
     int outfd, vnode_t *vp, offset_t *off)
 #else
@@ -570,6 +953,9 @@
 	}
 
 	if (fromsnap != 0) {
+		zfs_bookmark_phys_t zb;
+		boolean_t is_clone;
+
 		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
 		if (err != 0) {
 			dsl_dataset_rele(ds, FTAG);
@@ -576,34 +962,55 @@
 			dsl_pool_rele(dp, FTAG);
 			return (err);
 		}
+		if (!dsl_dataset_is_before(ds, fromds, 0))
+			err = SET_ERROR(EXDEV);
+		zb.zbm_creation_time =
+		    dsl_dataset_phys(fromds)->ds_creation_time;
+		zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
+		zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
+		is_clone = (fromds->ds_dir != ds->ds_dir);
+		dsl_dataset_rele(fromds, FTAG);
+		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+		    embedok, large_block_ok, outfd, 0, 0, fp, off);
+	} else {
+		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+		    embedok, large_block_ok, outfd, 0, 0, fp, off);
 	}
-
-	return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, fp, off));
+	dsl_dataset_rele(ds, FTAG);
+	return (err);
 }
 
 int
-dmu_send(const char *tosnap, const char *fromsnap,
+dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+    boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
 #ifdef illumos
-    int outfd, vnode_t *vp, offset_t *off)
+    vnode_t *vp, offset_t *off)
 #else
-    int outfd, struct file *fp, offset_t *off)
+    struct file *fp, offset_t *off)
 #endif
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
-	dsl_dataset_t *fromds = NULL;
 	int err;
+	boolean_t owned = B_FALSE;
 
-	if (strchr(tosnap, '@') == NULL)
+	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
 		return (SET_ERROR(EINVAL));
-	if (fromsnap != NULL && strchr(fromsnap, '@') == NULL)
-		return (SET_ERROR(EINVAL));
 
 	err = dsl_pool_hold(tosnap, FTAG, &dp);
 	if (err != 0)
 		return (err);
 
-	err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
+	if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
+		/*
+		 * We are sending a filesystem or volume.  Ensure
+		 * that it doesn't change by owning the dataset.
+		 */
+		err = dsl_dataset_own(dp, tosnap, FTAG, &ds);
+		owned = B_TRUE;
+	} else {
+		err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
+	}
 	if (err != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (err);
@@ -610,16 +1017,91 @@
 	}
 
 	if (fromsnap != NULL) {
-		err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
+		zfs_bookmark_phys_t zb;
+		boolean_t is_clone = B_FALSE;
+		int fsnamelen = strchr(tosnap, '@') - tosnap;
+
+		/*
+		 * If the fromsnap is in a different filesystem, then
+		 * mark the send stream as a clone.
+		 */
+		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
+		    (fromsnap[fsnamelen] != '@' &&
+		    fromsnap[fsnamelen] != '#')) {
+			is_clone = B_TRUE;
+		}
+
+		if (strchr(fromsnap, '@')) {
+			dsl_dataset_t *fromds;
+			err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
+			if (err == 0) {
+				if (!dsl_dataset_is_before(ds, fromds, 0))
+					err = SET_ERROR(EXDEV);
+				zb.zbm_creation_time =
+				    dsl_dataset_phys(fromds)->ds_creation_time;
+				zb.zbm_creation_txg =
+				    dsl_dataset_phys(fromds)->ds_creation_txg;
+				zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
+				is_clone = (ds->ds_dir != fromds->ds_dir);
+				dsl_dataset_rele(fromds, FTAG);
+			}
+		} else {
+			err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
+		}
 		if (err != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			dsl_pool_rele(dp, FTAG);
 			return (err);
 		}
+		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+		    embedok, large_block_ok,
+		    outfd, resumeobj, resumeoff, fp, off);
+	} else {
+		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+		    embedok, large_block_ok,
+		    outfd, resumeobj, resumeoff, fp, off);
 	}
-	return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, fp, off));
+	if (owned)
+		dsl_dataset_disown(ds, FTAG);
+	else
+		dsl_dataset_rele(ds, FTAG);
+	return (err);
 }
 
+static int
+dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size,
+    uint64_t *sizep)
+{
+	int err;
+	/*
+	 * Assume that space (both on-disk and in-stream) is dominated by
+	 * data.  We will adjust for indirect blocks and the copies property,
+	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
+	 */
+
+	/*
+	 * Subtract out approximate space used by indirect blocks.
+	 * Assume most space is used by data blocks (non-indirect, non-dnode).
+	 * Assume all blocks are recordsize.  Assume ditto blocks and
+	 * internal fragmentation counter out compression.
+	 *
+	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
+	 * block, which we observe in practice.
+	 */
+	uint64_t recordsize;
+	err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
+	if (err != 0)
+		return (err);
+	size -= size / recordsize * sizeof (blkptr_t);
+
+	/* Add in the space for the record associated with each block. */
+	size += size / recordsize * sizeof (dmu_replay_record_t);
+
+	*sizep = size;
+
+	return (0);
+}
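
To make the adjustment concrete: with the default 128 KiB recordsize, 1 GiB
of data is 8192 blocks.  Assuming a 128-byte blkptr_t and a replay record of
roughly 312 bytes (both values illustrative, not quoted from this tree):

    8192 blocks * 128 bytes  ~= 1.0 MiB subtracted (indirect blocks)
    8192 blocks * 312 bytes  ~= 2.4 MiB added      (per-block stream records)

so the estimate for a 1 GiB snapshot lands about 1.4 MiB above the raw
uncompressed data size.
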
+
 int
 dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
 {
@@ -630,19 +1112,23 @@
 	ASSERT(dsl_pool_config_held(dp));
 
 	/* tosnap must be a snapshot */
-	if (!dsl_dataset_is_snapshot(ds))
+	if (!ds->ds_is_snapshot)
 		return (SET_ERROR(EINVAL));
 
+	/* fromsnap, if provided, must be a snapshot */
+	if (fromds != NULL && !fromds->ds_is_snapshot)
+		return (SET_ERROR(EINVAL));
+
 	/*
 	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
 	 * or the origin's fs.
 	 */
-	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds))
+	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
 		return (SET_ERROR(EXDEV));
 
 	/* Get uncompressed size estimate of changed data. */
 	if (fromds == NULL) {
-		size = ds->ds_phys->ds_uncompressed_bytes;
+		size = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
 	} else {
 		uint64_t used, comp;
 		err = dsl_dataset_space_written(fromds, ds,
@@ -651,33 +1137,61 @@
 			return (err);
 	}
 
-	/*
-	 * Assume that space (both on-disk and in-stream) is dominated by
-	 * data.  We will adjust for indirect blocks and the copies property,
-	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
-	 */
+	err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
+	return (err);
+}
 
+/*
+ * Simple callback used to traverse the blocks of a snapshot and sum their
+ * uncompressed size.
+ */
+/* ARGSUSED */
+static int
+dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	uint64_t *spaceptr = arg;
+	if (bp != NULL && !BP_IS_HOLE(bp)) {
+		*spaceptr += BP_GET_UCSIZE(bp);
+	}
+	return (0);
+}
+
+/*
+ * Given a destination snapshot and a TXG, calculate the approximate size of a
+ * send stream sent from that TXG. from_txg may be zero, indicating that the
+ * whole snapshot will be sent.
+ */
+int
+dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
+    uint64_t *sizep)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	int err;
+	uint64_t size = 0;
+
+	ASSERT(dsl_pool_config_held(dp));
+
+	/* tosnap must be a snapshot */
+	if (!ds->ds_is_snapshot)
+		return (SET_ERROR(EINVAL));
+
+	/* verify that from_txg is before the provided snapshot was taken */
+	if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
+		return (SET_ERROR(EXDEV));
+	}
+
 	/*
-	 * Subtract out approximate space used by indirect blocks.
-	 * Assume most space is used by data blocks (non-indirect, non-dnode).
-	 * Assume all blocks are recordsize.  Assume ditto blocks and
-	 * internal fragmentation counter out compression.
-	 *
-	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
-	 * block, which we observe in practice.
+	 * traverse the blocks of the snapshot with birth times after
+	 * from_txg, summing their uncompressed size
 	 */
-	uint64_t recordsize;
-	err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
-	if (err != 0)
+	err = traverse_dataset(ds, from_txg, TRAVERSE_POST,
+	    dmu_calculate_send_traversal, &size);
+	if (err != 0)
 		return (err);
-	size -= size / recordsize * sizeof (blkptr_t);
 
-	/* Add in the space for the record associated with each block. */
-	size += size / recordsize * sizeof (dmu_replay_record_t);
-
-	*sizep = size;
-
-	return (0);
+	err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
+	return (err);
 }
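
Both estimators assert the pool config lock, so a caller looks roughly like
this (a sketch only; error handling abbreviated):

    uint64_t stream_size;

    dsl_pool_config_enter(dp, FTAG);
    err = dmu_send_estimate_from_txg(ds, from_txg, &stream_size);
    dsl_pool_config_exit(dp, FTAG);
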
 
 typedef struct dmu_recv_begin_arg {
@@ -684,6 +1198,7 @@
 	const char *drba_origin;
 	dmu_recv_cookie_t *drba_cookie;
 	cred_t *drba_cred;
+	uint64_t drba_snapobj;
 } dmu_recv_begin_arg_t;
 
 static int
@@ -694,14 +1209,9 @@
 	int error;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
-	/* must not have any changes since most recent snapshot */
-	if (!drba->drba_cookie->drc_force &&
-	    dsl_dataset_modified_since_lastsnap(ds))
-		return (SET_ERROR(ETXTBSY));
-
 	/* temporary clone name must not exist */
 	error = zap_lookup(dp->dp_meta_objset,
-	    ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name,
+	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
 	    8, 1, &val);
 	if (error != ENOENT)
 		return (error == 0 ? EBUSY : error);
@@ -708,47 +1218,69 @@
 
 	/* new snapshot name must not exist */
 	error = zap_lookup(dp->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap,
-	    8, 1, &val);
+	    dsl_dataset_phys(ds)->ds_snapnames_zapobj,
+	    drba->drba_cookie->drc_tosnap, 8, 1, &val);
 	if (error != ENOENT)
 		return (error == 0 ? EEXIST : error);
 
+	/*
+	 * Check snapshot limit before receiving. We'll recheck again at the
+	 * end, but might as well abort before receiving if we're already over
+	 * the limit.
+	 *
+	 * Note that we do not check the file system limit with
+	 * dsl_dir_fscount_check because the temporary %clones don't count
+	 * against that limit.
+	 */
+	error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
+	    NULL, drba->drba_cred);
+	if (error != 0)
+		return (error);
+
 	if (fromguid != 0) {
-		/* if incremental, most recent snapshot must match fromguid */
-		if (ds->ds_prev == NULL)
+		dsl_dataset_t *snap;
+		uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+
+		/* Find snapshot in this dir that matches fromguid. */
+		while (obj != 0) {
+			error = dsl_dataset_hold_obj(dp, obj, FTAG,
+			    &snap);
+			if (error != 0)
+				return (SET_ERROR(ENODEV));
+			if (snap->ds_dir != ds->ds_dir) {
+				dsl_dataset_rele(snap, FTAG);
+				return (SET_ERROR(ENODEV));
+			}
+			if (dsl_dataset_phys(snap)->ds_guid == fromguid)
+				break;
+			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+			dsl_dataset_rele(snap, FTAG);
+		}
+		if (obj == 0)
 			return (SET_ERROR(ENODEV));
 
-		/*
-		 * most recent snapshot must match fromguid, or there are no
-		 * changes since the fromguid one
-		 */
-		if (ds->ds_prev->ds_phys->ds_guid != fromguid) {
-			uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
-			uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
-			while (obj != 0) {
-				dsl_dataset_t *snap;
-				error = dsl_dataset_hold_obj(dp, obj, FTAG,
-				    &snap);
-				if (error != 0)
-					return (SET_ERROR(ENODEV));
-				if (snap->ds_phys->ds_creation_txg < birth) {
-					dsl_dataset_rele(snap, FTAG);
-					return (SET_ERROR(ENODEV));
-				}
-				if (snap->ds_phys->ds_guid == fromguid) {
-					dsl_dataset_rele(snap, FTAG);
-					break; /* it's ok */
-				}
-				obj = snap->ds_phys->ds_prev_snap_obj;
+		if (drba->drba_cookie->drc_force) {
+			drba->drba_snapobj = obj;
+		} else {
+			/*
+			 * If we are not forcing, there must be no
+			 * changes since fromsnap.
+			 */
+			if (dsl_dataset_modified_since_snap(ds, snap)) {
 				dsl_dataset_rele(snap, FTAG);
+				return (SET_ERROR(ETXTBSY));
 			}
-			if (obj == 0)
-				return (SET_ERROR(ENODEV));
+			drba->drba_snapobj = ds->ds_prev->ds_object;
 		}
+
+		dsl_dataset_rele(snap, FTAG);
 	} else {
-		/* if full, most recent snapshot must be $ORIGIN */
-		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
-			return (SET_ERROR(ENODEV));
+		/* if full, then must be forced */
+		if (!drba->drba_cookie->drc_force)
+			return (SET_ERROR(EEXIST));
+		/* start from $ORIGIN@$ORIGIN, if supported */
+		drba->drba_snapobj = dp->dp_origin_snap != NULL ?
+		    dp->dp_origin_snap->ds_object : 0;
 	}
 
 	return (0);
@@ -764,11 +1296,13 @@
 	uint64_t fromguid = drrb->drr_fromguid;
 	int flags = drrb->drr_flags;
 	int error;
+	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
 	dsl_dataset_t *ds;
 	const char *tofs = drba->drba_cookie->drc_tofs;
 
 	/* already checked */
 	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+	ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));
 
 	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
 	    DMU_COMPOUNDSTREAM ||
@@ -777,18 +1311,42 @@
 		return (SET_ERROR(EINVAL));
 
 	/* Verify pool version supports SA if SA_SPILL feature set */
-	if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
-	    DMU_BACKUP_FEATURE_SA_SPILL) &&
-	    spa_version(dp->dp_spa) < SPA_VERSION_SA) {
+	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
 		return (SET_ERROR(ENOTSUP));
-	}
 
+	if (drba->drba_cookie->drc_resumable &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
+		return (SET_ERROR(ENOTSUP));
+
+	/*
+	 * The receiving code doesn't know how to translate a WRITE_EMBEDDED
+ * record to a plain WRITE record, so the pool must have the
+	 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
+	 * records.  Same with WRITE_EMBEDDED records that use LZ4 compression.
+	 */
+	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
+		return (SET_ERROR(ENOTSUP));
+	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+		return (SET_ERROR(ENOTSUP));
+
+	/*
+	 * The receiving code doesn't know how to translate large blocks
+	 * to smaller ones, so the pool must have the LARGE_BLOCKS
+	 * feature enabled if the stream has LARGE_BLOCKS.
+	 */
+	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+		return (SET_ERROR(ENOTSUP));
+
 	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
 	if (error == 0) {
 		/* target fs already exists; recv into temp clone */
 
 		/* Can't recv a clone into an existing fs */
-		if (flags & DRR_FLAG_CLONE) {
+		if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
 			dsl_dataset_rele(ds, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
@@ -797,22 +1355,51 @@
 		dsl_dataset_rele(ds, FTAG);
 	} else if (error == ENOENT) {
 		/* target fs does not exist; must be a full backup or clone */
-		char buf[MAXNAMELEN];
+		char buf[ZFS_MAX_DATASET_NAME_LEN];
 
 		/*
 		 * If it's a non-clone incremental, we are missing the
 		 * target fs, so fail the recv.
 		 */
-		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
+		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE ||
+		    drba->drba_origin))
 			return (SET_ERROR(ENOENT));
 
+		/*
+		 * If we're receiving a full send as a clone, and it doesn't
+		 * contain all the necessary free records and freeobject
+		 * records, reject it.
+		 */
+		if (fromguid == 0 && drba->drba_origin &&
+		    !(flags & DRR_FLAG_FREERECORDS))
+			return (SET_ERROR(EINVAL));
+
 		/* Open the parent of tofs */
-		ASSERT3U(strlen(tofs), <, MAXNAMELEN);
+		ASSERT3U(strlen(tofs), <, sizeof (buf));
 		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
 		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
 		if (error != 0)
 			return (error);
 
+		/*
+		 * Check filesystem and snapshot limits before receiving. We'll
+		 * recheck snapshot limits again at the end (we create the
+		 * filesystems and increment those counts during begin_sync).
+		 */
+		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+		    ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
+		if (error != 0) {
+			dsl_dataset_rele(ds, FTAG);
+			return (error);
+		}
+
+		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
+		if (error != 0) {
+			dsl_dataset_rele(ds, FTAG);
+			return (error);
+		}
+
 		if (drba->drba_origin != NULL) {
 			dsl_dataset_t *origin;
 			error = dsl_dataset_hold(dp, drba->drba_origin,
@@ -821,12 +1408,13 @@
 				dsl_dataset_rele(ds, FTAG);
 				return (error);
 			}
-			if (!dsl_dataset_is_snapshot(origin)) {
+			if (!origin->ds_is_snapshot) {
 				dsl_dataset_rele(origin, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
-			if (origin->ds_phys->ds_guid != fromguid) {
+			if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
+			    fromguid != 0) {
 				dsl_dataset_rele(origin, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(ENODEV));
@@ -844,21 +1432,29 @@
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
 	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
 	const char *tofs = drba->drba_cookie->drc_tofs;
 	dsl_dataset_t *ds, *newds;
 	uint64_t dsobj;
 	int error;
-	uint64_t crflags;
+	uint64_t crflags = 0;
 
-	crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
-	    DS_FLAG_CI_DATASET : 0;
+	if (drrb->drr_flags & DRR_FLAG_CI_DATA)
+		crflags |= DS_FLAG_CI_DATASET;
 
 	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
 	if (error == 0) {
 		/* create temporary clone */
+		dsl_dataset_t *snap = NULL;
+		if (drba->drba_snapobj != 0) {
+			VERIFY0(dsl_dataset_hold_obj(dp,
+			    drba->drba_snapobj, FTAG, &snap));
+		}
 		dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
-		    ds->ds_prev, crflags, drba->drba_cred, tx);
+		    snap, crflags, drba->drba_cred, tx);
+		if (drba->drba_snapobj != 0)
+			dsl_dataset_rele(snap, FTAG);
 		dsl_dataset_rele(ds, FTAG);
 	} else {
 		dsl_dir_t *dd;
@@ -883,17 +1479,44 @@
 	}
 	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
 
+	if (drba->drba_cookie->drc_resumable) {
+		dsl_dataset_zapify(newds, tx);
+		if (drrb->drr_fromguid != 0) {
+			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
+			    8, 1, &drrb->drr_fromguid, tx));
+		}
+		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
+		    8, 1, &drrb->drr_toguid, tx));
+		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
+		    1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
+		uint64_t one = 1;
+		uint64_t zero = 0;
+		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
+		    8, 1, &one, tx));
+		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
+		    8, 1, &zero, tx));
+		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
+		    8, 1, &zero, tx));
+		if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+		    DMU_BACKUP_FEATURE_EMBED_DATA) {
+			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
+			    8, 1, &one, tx));
+		}
+	}
+
 	dmu_buf_will_dirty(newds->ds_dbuf, tx);
-	newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+	dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
 
 	/*
 	 * If we actually created a non-clone, we need to create the
 	 * objset in our new dataset.
 	 */
+	rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
 	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
 		(void) dmu_objset_create_impl(dp->dp_spa,
 		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
 	}
+	rrw_exit(&newds->ds_bp_rwlock, FTAG);
 
 	drba->drba_cookie->drc_ds = newds;
 
@@ -900,67 +1523,266 @@
 	spa_history_log_internal_ds(newds, "receive", tx, "");
 }
 
+static int
+dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
+{
+	dmu_recv_begin_arg_t *drba = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+	int error;
+	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+	dsl_dataset_t *ds;
+	const char *tofs = drba->drba_cookie->drc_tofs;
+
+	/* already checked */
+	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+	ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING);
+
+	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+	    DMU_COMPOUNDSTREAM ||
+	    drrb->drr_type >= DMU_OST_NUMTYPES)
+		return (SET_ERROR(EINVAL));
+
+	/* Verify pool version supports SA if SA_SPILL feature set */
+	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
+		return (SET_ERROR(ENOTSUP));
+
+	/*
+	 * The receiving code doesn't know how to translate a WRITE_EMBEDDED
+	 * record to a plain WRITE record, so the pool must have the
+	 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
+	 * records.  Same with WRITE_EMBEDDED records that use LZ4 compression.
+	 */
+	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
+		return (SET_ERROR(ENOTSUP));
+	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+		return (SET_ERROR(ENOTSUP));
+
+	/* 6 extra bytes for /%recv */
+	char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+	(void) snprintf(recvname, sizeof (recvname), "%s/%s",
+	    tofs, recv_clone_name);
+
+	if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
+		/* %recv does not exist; continue in tofs */
+		error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+		if (error != 0)
+			return (error);
+	}
+
+	/* check that ds is marked inconsistent */
+	if (!DS_IS_INCONSISTENT(ds)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/* check that there is resuming data, and that the toguid matches */
+	if (!dsl_dataset_is_zapified(ds)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+	uint64_t val;
+	error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
+	    DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
+	if (error != 0 || drrb->drr_toguid != val) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Check if the receive is still running.  If so, it will be owned.
+	 * Note that nothing else can own the dataset (e.g. after the receive
+	 * fails) because it will be marked inconsistent.
+	 */
+	if (dsl_dataset_has_owner(ds)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EBUSY));
+	}
+
+	/* There should not be any snapshots of this fs yet. */
+	if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Note: resume point will be checked when we process the first WRITE
+	 * record.
+	 */
+
+	/* check that the origin matches */
+	val = 0;
+	(void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
+	    DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
+	if (drrb->drr_fromguid != val) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+static void
+dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
+{
+	dmu_recv_begin_arg_t *drba = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	const char *tofs = drba->drba_cookie->drc_tofs;
+	dsl_dataset_t *ds;
+	uint64_t dsobj;
+	/* 6 extra bytes for /%recv */
+	char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+	(void) snprintf(recvname, sizeof (recvname), "%s/%s",
+	    tofs, recv_clone_name);
+
+	if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
+		/* %recv does not exist; continue in tofs */
+		VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds));
+		drba->drba_cookie->drc_newfs = B_TRUE;
+	}
+
+	/* clear the inconsistent flag so that we can own it */
+	ASSERT(DS_IS_INCONSISTENT(ds));
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+	dsobj = ds->ds_object;
+	dsl_dataset_rele(ds, FTAG);
+
+	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds));
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)));
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+	drba->drba_cookie->drc_ds = ds;
+
+	spa_history_log_internal_ds(ds, "resume receive", tx, "");
+}
+
 /*
  * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
  * succeeds; otherwise we will leak the holds on the datasets.
  */
 int
-dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
-    boolean_t force, char *origin, dmu_recv_cookie_t *drc)
+dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
+    boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc)
 {
 	dmu_recv_begin_arg_t drba = { 0 };
-	dmu_replay_record_t *drr;
 
 	bzero(drc, sizeof (dmu_recv_cookie_t));
-	drc->drc_drrb = drrb;
+	drc->drc_drr_begin = drr_begin;
+	drc->drc_drrb = &drr_begin->drr_u.drr_begin;
 	drc->drc_tosnap = tosnap;
 	drc->drc_tofs = tofs;
 	drc->drc_force = force;
+	drc->drc_resumable = resumable;
+	drc->drc_cred = CRED();
 
-	if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
+	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
 		drc->drc_byteswap = B_TRUE;
-	else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
-		return (SET_ERROR(EINVAL));
-
-	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
-	drr->drr_type = DRR_BEGIN;
-	drr->drr_u.drr_begin = *drc->drc_drrb;
-	if (drc->drc_byteswap) {
-		fletcher_4_incremental_byteswap(drr,
+		fletcher_4_incremental_byteswap(drr_begin,
 		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
+		byteswap_record(drr_begin);
+	} else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
+		fletcher_4_incremental_native(drr_begin,
+		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
 	} else {
-		fletcher_4_incremental_native(drr,
-		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
+		return (SET_ERROR(EINVAL));
 	}
-	kmem_free(drr, sizeof (dmu_replay_record_t));
 
-	if (drc->drc_byteswap) {
-		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
-		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
-		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
-		drrb->drr_type = BSWAP_32(drrb->drr_type);
-		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
-		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
-	}
-
 	drba.drba_origin = origin;
 	drba.drba_cookie = drc;
 	drba.drba_cred = CRED();
 
-	return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
-	    &drba, 5));
+	if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
+	    DMU_BACKUP_FEATURE_RESUMING) {
+		return (dsl_sync_task(tofs,
+		    dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
+		    &drba, 5, ZFS_SPACE_CHECK_NORMAL));
+	} else {
+		return (dsl_sync_task(tofs,
+		    dmu_recv_begin_check, dmu_recv_begin_sync,
+		    &drba, 5, ZFS_SPACE_CHECK_NORMAL));
+	}
 }
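
Per the NB above, a successful dmu_recv_begin() must be followed by
dmu_recv_stream().  A hedged sketch of the new calling convention (the BEGIN
record is now read off the stream by the caller and passed in whole; the
argument values here are illustrative):

    dmu_recv_cookie_t drc;
    dmu_replay_record_t drr_begin;  /* already read from the stream */

    err = dmu_recv_begin(tofs, tosnap, &drr_begin,
        B_FALSE /* force */, B_TRUE /* resumable */, NULL /* origin */, &drc);
    if (err != 0)
        return (err);
    /* ... dmu_recv_stream() and dmu_recv_end() follow on success ... */
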
 
-struct restorearg {
+struct receive_record_arg {
+	dmu_replay_record_t header;
+	void *payload; /* Pointer to a buffer containing the payload */
+	/*
+	 * If the record is a write, pointer to the arc_buf_t containing the
+	 * payload.
+	 */
+	arc_buf_t *write_buf;
+	int payload_size;
+	uint64_t bytes_read; /* bytes read from stream when record created */
+	boolean_t eos_marker; /* Marks the end of the stream */
+	bqueue_node_t node;
+};
+
+struct receive_writer_arg {
+	objset_t *os;
+	boolean_t byteswap;
+	bqueue_t q;
+
+	/*
+	 * These three args are used to signal to the main thread that we're
+	 * done.
+	 */
+	kmutex_t mutex;
+	kcondvar_t cv;
+	boolean_t done;
+
 	int err;
-	boolean_t byteswap;
+	/* A map from guid to dataset to help handle dedup'd streams. */
+	avl_tree_t *guid_to_ds_map;
+	boolean_t resumable;
+	uint64_t last_object, last_offset;
+	uint64_t bytes_read; /* bytes read when current record created */
+};
+
+struct objlist {
+	list_t list; /* List of struct receive_objnode. */
+	/*
+	 * Last object looked up. Used to assert that objects are being looked
+	 * up in ascending order.
+	 */
+	uint64_t last_lookup;
+};
+
+struct receive_objnode {
+	list_node_t node;
+	uint64_t object;
+};
+
+struct receive_arg {
+	objset_t *os;
 	kthread_t *td;
 	struct file *fp;
-	char *buf;
-	uint64_t voff;
-	int bufsize; /* amount of memory allocated for buf */
+	uint64_t voff; /* The current offset in the stream */
+	uint64_t bytes_read;
+	/*
+	 * A record that has had its payload read in, but hasn't yet been handed
+	 * off to the worker thread.
+	 */
+	struct receive_record_arg *rrd;
+	/* A record that has had its header read in, but not its payload. */
+	struct receive_record_arg *next_rrd;
 	zio_cksum_t cksum;
-	avl_tree_t *guid_to_ds_map;
+	zio_cksum_t prev_cksum;
+	int err;
+	boolean_t byteswap;
+	/* Sorted list of objects not to issue prefetches for. */
+	struct objlist ignore_objlist;
 };
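
The objlist above is kept sorted so the prefetch path can skip objects
cheaply.  Helpers along these lines would maintain it (names and bodies are a
sketch; the real ones are outside this hunk; inserts must arrive in ascending
object order):

    static void
    objlist_insert(struct objlist *list, uint64_t object)
    {
        struct receive_objnode *node =
            kmem_zalloc(sizeof (*node), KM_SLEEP);
        node->object = object;
        list_insert_tail(&list->list, node);
    }

    static boolean_t
    objlist_exists(struct objlist *list, uint64_t object)
    {
        struct receive_objnode *node = list_head(&list->list);

        ASSERT3U(object, >=, list->last_lookup);
        list->last_lookup = object;
        /* drop entries for objects already passed by */
        while (node != NULL && node->object < object) {
            list_remove(&list->list, node);
            kmem_free(node, sizeof (*node));
            node = list_head(&list->list);
        }
        return (node != NULL && node->object == object);
    }
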
 
 typedef struct guid_map_entry {
@@ -999,7 +1821,7 @@
 }
 
 static int
-restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid)
+restore_bytes(struct receive_arg *ra, void *buf, int len, off_t off, ssize_t *resid)
 {
 	struct uio auio;
 	struct iovec aiov;
@@ -1024,45 +1846,50 @@
 	return (error);
 }
 
-static void *
-restore_read(struct restorearg *ra, int len)
+static int
+receive_read(struct receive_arg *ra, int len, void *buf)
 {
-	void *rv;
 	int done = 0;
 
-	/* some things will require 8-byte alignment, so everything must */
+	/*
+	 * The code doesn't rely on this (lengths being multiples of 8).  See
+	 * comment in dump_bytes.
+	 */
 	ASSERT0(len % 8);
 
 	while (done < len) {
 		ssize_t resid;
 
-		ra->err = restore_bytes(ra, (caddr_t)ra->buf + done,
+		ra->err = restore_bytes(ra, buf + done,
 		    len - done, ra->voff, &resid);
 
-		if (resid == len - done)
-			ra->err = SET_ERROR(EINVAL);
+		if (resid == len - done) {
+			/*
+			 * Note: ECKSUM indicates that the receive
+			 * was interrupted and can potentially be resumed.
+			 */
+			ra->err = SET_ERROR(ECKSUM);
+		}
 		ra->voff += len - done - resid;
 		done = len - resid;
 		if (ra->err != 0)
-			return (NULL);
+			return (ra->err);
 	}
 
+	ra->bytes_read += len;
+
 	ASSERT3U(done, ==, len);
-	rv = ra->buf;
-	if (ra->byteswap)
-		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
-	else
-		fletcher_4_incremental_native(rv, len, &ra->cksum);
-	return (rv);
+	return (0);
 }
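
Checksumming has moved out of receive_read() and into its callers, presumably
via a small helper of this shape (name illustrative), built on the
fletcher_4_incremental_* functions seen elsewhere in this change:

    static void
    receive_cksum(struct receive_arg *ra, int len, void *buf)
    {
        if (ra->byteswap)
            fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
        else
            fletcher_4_incremental_native(buf, len, &ra->cksum);
    }
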
 
 static void
-backup_byteswap(dmu_replay_record_t *drr)
+byteswap_record(dmu_replay_record_t *drr)
 {
 #define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
 #define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
 	drr->drr_type = BSWAP_32(drr->drr_type);
 	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
+
 	switch (drr->drr_type) {
 	case DRR_BEGIN:
 		DO64(drr_begin.drr_magic);
@@ -1075,7 +1902,6 @@
 		break;
 	case DRR_OBJECT:
 		DO64(drr_object.drr_object);
-		/* DO64(drr_object.drr_allocation_txg); */
 		DO32(drr_object.drr_type);
 		DO32(drr_object.drr_bonustype);
 		DO32(drr_object.drr_blksz);
@@ -1093,10 +1919,7 @@
 		DO64(drr_write.drr_offset);
 		DO64(drr_write.drr_length);
 		DO64(drr_write.drr_toguid);
-		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
-		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
-		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
-		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
+		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
 		DO64(drr_write.drr_key.ddk_prop);
 		break;
 	case DRR_WRITE_BYREF:
@@ -1107,12 +1930,18 @@
 		DO64(drr_write_byref.drr_refguid);
 		DO64(drr_write_byref.drr_refobject);
 		DO64(drr_write_byref.drr_refoffset);
-		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
-		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
-		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
-		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
+		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref.
+		    drr_key.ddk_cksum);
 		DO64(drr_write_byref.drr_key.ddk_prop);
 		break;
+	case DRR_WRITE_EMBEDDED:
+		DO64(drr_write_embedded.drr_object);
+		DO64(drr_write_embedded.drr_offset);
+		DO64(drr_write_embedded.drr_length);
+		DO64(drr_write_embedded.drr_toguid);
+		DO32(drr_write_embedded.drr_lsize);
+		DO32(drr_write_embedded.drr_psize);
+		break;
 	case DRR_FREE:
 		DO64(drr_free.drr_object);
 		DO64(drr_free.drr_offset);
@@ -1125,23 +1954,75 @@
 		DO64(drr_spill.drr_toguid);
 		break;
 	case DRR_END:
-		DO64(drr_end.drr_checksum.zc_word[0]);
-		DO64(drr_end.drr_checksum.zc_word[1]);
-		DO64(drr_end.drr_checksum.zc_word[2]);
-		DO64(drr_end.drr_checksum.zc_word[3]);
 		DO64(drr_end.drr_toguid);
+		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
 		break;
 	}
+
+	if (drr->drr_type != DRR_BEGIN) {
+		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
+	}
+
 #undef DO64
 #undef DO32
 }
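
ZIO_CHECKSUM_BSWAP replaces the four hand-rolled DO64 lines per checksum; its
expansion is presumably equivalent to the following (the real macro lives in
zio.h):

    #define ZIO_CHECKSUM_BSWAP(zcp)                             \
    {                                                           \
        (zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]);        \
        (zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]);        \
        (zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]);        \
        (zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]);        \
    }
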
 
+static inline uint8_t
+deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
+{
+	if (bonus_type == DMU_OT_SA) {
+		return (1);
+	} else {
+		return (1 +
+		    ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT));
+	}
+}
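
Worked through with a 320-byte DN_MAX_BONUSLEN and 128-byte block pointers
(SPA_BLKPTRSHIFT == 7; both constants assumed here for illustration):

    deduce_nblkptr(DMU_OT_SA,  any) == 1               (spill block holds the rest)
    deduce_nblkptr(other,      320) == 1 + (0 >> 7)    == 1
    deduce_nblkptr(other,       64) == 1 + (256 >> 7)  == 3
    deduce_nblkptr(other,        0) == 1 + (320 >> 7)  == 3
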
+
+static void
+save_resume_state(struct receive_writer_arg *rwa,
+    uint64_t object, uint64_t offset, dmu_tx_t *tx)
+{
+	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+	if (!rwa->resumable)
+		return;
+
+	/*
+	 * We use ds_resume_bytes[] != 0 to indicate that we need to
+	 * update this on disk, so it must not be 0.
+	 */
+	ASSERT(rwa->bytes_read != 0);
+
+	/*
+	 * We only resume from write records, which have a valid
+	 * (non-meta-dnode) object number.
+	 */
+	ASSERT(object != 0);
+
+	/*
+	 * For resuming to work correctly, we must receive records in order,
+	 * sorted by object,offset.  This is checked by the callers, but
+	 * assert it here for good measure.
+	 */
+	ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]);
+	ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] ||
+	    offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]);
+	ASSERT3U(rwa->bytes_read, >=,
+	    rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]);
+
+	rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object;
+	rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset;
+	rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read;
+}
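
save_resume_state() only records the cursor in the in-core ds_resume_*
arrays; writing it out happens later in sync context.  A sketch of that
sync-side flush, assuming it reuses the DS_FIELD_RESUME_* ZAP keys created in
dmu_recv_begin_sync() (the real flush lives in the dataset sync path):

    if (ds->ds_resume_bytes[txgoff] != 0) {
        VERIFY0(zap_update(mos, ds->ds_object, DS_FIELD_RESUME_OBJECT,
            8, 1, &ds->ds_resume_object[txgoff], tx));
        VERIFY0(zap_update(mos, ds->ds_object, DS_FIELD_RESUME_OFFSET,
            8, 1, &ds->ds_resume_offset[txgoff], tx));
        VERIFY0(zap_update(mos, ds->ds_object, DS_FIELD_RESUME_BYTES,
            8, 1, &ds->ds_resume_bytes[txgoff], tx));
    }
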
+
 static int
-restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
+receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+    void *data)
 {
+	dmu_object_info_t doi;
+	dmu_tx_t *tx;
+	uint64_t object;
 	int err;
-	dmu_tx_t *tx;
-	void *data = NULL;
 
 	if (drro->drr_type == DMU_OT_NONE ||
 	    !DMU_OT_IS_VALID(drro->drr_type) ||
@@ -1150,66 +2031,78 @@
 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
-	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
 	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
 		return (SET_ERROR(EINVAL));
 	}
 
-	err = dmu_object_info(os, drro->drr_object, NULL);
+	err = dmu_object_info(rwa->os, drro->drr_object, &doi);
 
 	if (err != 0 && err != ENOENT)
 		return (SET_ERROR(EINVAL));
+	object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
 
-	if (drro->drr_bonuslen) {
-		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
-		if (ra->err != 0)
-			return (ra->err);
+	/*
+	 * If we are losing blkptrs or changing the block size this must
+	 * be a new file instance.  We must clear out the previous file
+	 * contents before we can change this type of metadata in the dnode.
+	 */
+	if (err == 0) {
+		int nblkptr;
+
+		nblkptr = deduce_nblkptr(drro->drr_bonustype,
+		    drro->drr_bonuslen);
+
+		if (drro->drr_blksz != doi.doi_data_block_size ||
+		    nblkptr < doi.doi_nblkptr) {
+			err = dmu_free_long_range(rwa->os, drro->drr_object,
+			    0, DMU_OBJECT_END);
+			if (err != 0)
+				return (SET_ERROR(EINVAL));
+		}
 	}
 
-	if (err == ENOENT) {
+	tx = dmu_tx_create(rwa->os);
+	dmu_tx_hold_bonus(tx, object);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+
+	if (object == DMU_NEW_OBJECT) {
 		/* currently free, want to be allocated */
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-		err = dmu_tx_assign(tx, TXG_WAIT);
-		if (err != 0) {
-			dmu_tx_abort(tx);
-			return (err);
-		}
-		err = dmu_object_claim(os, drro->drr_object,
+		err = dmu_object_claim(rwa->os, drro->drr_object,
 		    drro->drr_type, drro->drr_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
-		dmu_tx_commit(tx);
-	} else {
-		/* currently allocated, want to be allocated */
-		err = dmu_object_reclaim(os, drro->drr_object,
+	} else if (drro->drr_type != doi.doi_type ||
+	    drro->drr_blksz != doi.doi_data_block_size ||
+	    drro->drr_bonustype != doi.doi_bonus_type ||
+	    drro->drr_bonuslen != doi.doi_bonus_size) {
+		/* currently allocated, but with different properties */
+		err = dmu_object_reclaim(rwa->os, drro->drr_object,
 		    drro->drr_type, drro->drr_blksz,
-		    drro->drr_bonustype, drro->drr_bonuslen);
+		    drro->drr_bonustype, drro->drr_bonuslen, tx);
 	}
 	if (err != 0) {
+		dmu_tx_commit(tx);
 		return (SET_ERROR(EINVAL));
 	}
 
-	tx = dmu_tx_create(os);
-	dmu_tx_hold_bonus(tx, drro->drr_object);
-	err = dmu_tx_assign(tx, TXG_WAIT);
-	if (err != 0) {
-		dmu_tx_abort(tx);
-		return (err);
-	}
+	dmu_object_set_checksum(rwa->os, drro->drr_object,
+	    drro->drr_checksumtype, tx);
+	dmu_object_set_compress(rwa->os, drro->drr_object,
+	    drro->drr_compress, tx);
 
-	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
-	    tx);
-	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
-
 	if (data != NULL) {
 		dmu_buf_t *db;
 
-		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
+		VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db));
 		dmu_buf_will_dirty(db, tx);
 
 		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
 		bcopy(data, db->db_data, drro->drr_bonuslen);
-		if (ra->byteswap) {
+		if (rwa->byteswap) {
 			dmu_object_byteswap_t byteswap =
 			    DMU_OT_BYTESWAP(drro->drr_bonustype);
 			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
@@ -1218,40 +2111,43 @@
 		dmu_buf_rele(db, FTAG);
 	}
 	dmu_tx_commit(tx);
+
 	return (0);
 }
 
 /* ARGSUSED */
 static int
-restore_freeobjects(struct restorearg *ra, objset_t *os,
+receive_freeobjects(struct receive_writer_arg *rwa,
     struct drr_freeobjects *drrfo)
 {
 	uint64_t obj;
+	int next_err = 0;
 
 	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
 		return (SET_ERROR(EINVAL));
 
 	for (obj = drrfo->drr_firstobj;
-	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
-	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
+	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
+	    next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
 		int err;
 
-		if (dmu_object_info(os, obj, NULL) != 0)
+		if (dmu_object_info(rwa->os, obj, NULL) != 0)
 			continue;
 
-		err = dmu_free_object(os, obj);
+		err = dmu_free_long_object(rwa->os, obj);
 		if (err != 0)
 			return (err);
 	}
+	if (next_err != ESRCH)
+		return (next_err);
 	return (0);
 }
 
 static int
-restore_write(struct restorearg *ra, objset_t *os,
-    struct drr_write *drrw)
+receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
+    arc_buf_t *abuf)
 {
 	dmu_tx_t *tx;
-	void *data;
 	int err;
 
 	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
@@ -1258,14 +2154,22 @@
 	    !DMU_OT_IS_VALID(drrw->drr_type))
 		return (SET_ERROR(EINVAL));
 
-	data = restore_read(ra, drrw->drr_length);
-	if (data == NULL)
-		return (ra->err);
+	/*
+	 * For resuming to work, records must be in increasing order
+	 * by (object, offset).
+	 */
+	if (drrw->drr_object < rwa->last_object ||
+	    (drrw->drr_object == rwa->last_object &&
+	    drrw->drr_offset < rwa->last_offset)) {
+		return (SET_ERROR(EINVAL));
+	}
+	rwa->last_object = drrw->drr_object;
+	rwa->last_offset = drrw->drr_offset;
 
-	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
+	if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
-	tx = dmu_tx_create(os);
+	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_write(tx, drrw->drr_object,
 	    drrw->drr_offset, drrw->drr_length);
@@ -1274,14 +2178,28 @@
 		dmu_tx_abort(tx);
 		return (err);
 	}
-	if (ra->byteswap) {
+	if (rwa->byteswap) {
 		dmu_object_byteswap_t byteswap =
 		    DMU_OT_BYTESWAP(drrw->drr_type);
-		dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
+		dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
+		    drrw->drr_length);
 	}
-	dmu_write(os, drrw->drr_object,
-	    drrw->drr_offset, drrw->drr_length, data, tx);
+
+	dmu_buf_t *bonus;
+	if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0)
+		return (SET_ERROR(EINVAL));
+	dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
+
+	/*
+	 * Note: If the receive fails, we want the resume stream to start
+	 * with the same record that we last successfully received (as opposed
+	 * to the next record), so that we can verify that we are
+	 * resuming from the correct location.
+	 */
+	save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
 	dmu_tx_commit(tx);
+	dmu_buf_rele(bonus, FTAG);
+
 	return (0);
 }
 
@@ -1293,7 +2211,7 @@
  * data from the stream to fulfill this write.
  */
 static int
-restore_write_byref(struct restorearg *ra, objset_t *os,
+receive_write_byref(struct receive_writer_arg *rwa,
     struct drr_write_byref *drrwbr)
 {
 	dmu_tx_t *tx;
@@ -1300,7 +2218,7 @@
 	int err;
 	guid_map_entry_t gmesrch;
 	guid_map_entry_t *gmep;
-	avl_index_t	where;
+	avl_index_t where;
 	objset_t *ref_os = NULL;
 	dmu_buf_t *dbp;
 
@@ -1313,7 +2231,7 @@
 	 */
 	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
 		gmesrch.guid = drrwbr->drr_refguid;
-		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
+		if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch,
 		    &where)) == NULL) {
 			return (SET_ERROR(EINVAL));
 		}
@@ -1320,14 +2238,15 @@
 		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
 			return (SET_ERROR(EINVAL));
 	} else {
-		ref_os = os;
+		ref_os = rwa->os;
 	}
 
-	if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
-	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
+	err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
+	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
+	if (err != 0)
 		return (err);
 
-	tx = dmu_tx_create(os);
+	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_write(tx, drrwbr->drr_object,
 	    drrwbr->drr_offset, drrwbr->drr_length);
@@ -1336,39 +2255,77 @@
 		dmu_tx_abort(tx);
 		return (err);
 	}
-	dmu_write(os, drrwbr->drr_object,
+	dmu_write(rwa->os, drrwbr->drr_object,
 	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
 	dmu_buf_rele(dbp, FTAG);
+
+	/* See comment in receive_write(). */
+	save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx);
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 static int
-restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
+receive_write_embedded(struct receive_writer_arg *rwa,
+    struct drr_write_embedded *drrwe, void *data)
 {
 	dmu_tx_t *tx;
-	void *data;
+	int err;
+
+	if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset)
+		return (EINVAL);
+
+	if (drrwe->drr_psize > BPE_PAYLOAD_SIZE)
+		return (EINVAL);
+
+	if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES)
+		return (EINVAL);
+	if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
+		return (EINVAL);
+
+	tx = dmu_tx_create(rwa->os);
+
+	dmu_tx_hold_write(tx, drrwe->drr_object,
+	    drrwe->drr_offset, drrwe->drr_length);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+
+	dmu_write_embedded(rwa->os, drrwe->drr_object,
+	    drrwe->drr_offset, data, drrwe->drr_etype,
+	    drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
+	    rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+
+	/* See comment in receive_write(). */
+	save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx);
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+static int
+receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
+    void *data)
+{
+	dmu_tx_t *tx;
 	dmu_buf_t *db, *db_spill;
 	int err;
 
 	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
-	    drrs->drr_length > SPA_MAXBLOCKSIZE)
+	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
 		return (SET_ERROR(EINVAL));
 
-	data = restore_read(ra, drrs->drr_length);
-	if (data == NULL)
-		return (ra->err);
-
-	if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
+	if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
-	VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
+	VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
 	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
 		dmu_buf_rele(db, FTAG);
 		return (err);
 	}
 
-	tx = dmu_tx_create(os);
+	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_spill(tx, db->db_object);
 
@@ -1395,8 +2352,7 @@
 
 /* ARGSUSED */
 static int
-restore_free(struct restorearg *ra, objset_t *os,
-    struct drr_free *drrf)
+receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
 {
 	int err;
 
@@ -1404,11 +2360,12 @@
 	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
 		return (SET_ERROR(EINVAL));
 
-	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
+	if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
-	err = dmu_free_long_range(os, drrf->drr_object,
+	err = dmu_free_long_range(rwa->os, drrf->drr_object,
 	    drrf->drr_offset, drrf->drr_length);
+
 	return (err);
 }
 
@@ -1416,13 +2373,445 @@
 static void
 dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
 {
-	char name[MAXNAMELEN];
-	dsl_dataset_name(drc->drc_ds, name);
-	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
-	(void) dsl_destroy_head(name);
+	if (drc->drc_resumable) {
+		/* wait for our resume state to be written to disk */
+		txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0);
+		dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+	} else {
+		char name[ZFS_MAX_DATASET_NAME_LEN];
+		dsl_dataset_name(drc->drc_ds, name);
+		dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+		(void) dsl_destroy_head(name);
+	}
 }
 
+static void
+receive_cksum(struct receive_arg *ra, int len, void *buf)
+{
+	if (ra->byteswap) {
+		fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
+	} else {
+		fletcher_4_incremental_native(buf, len, &ra->cksum);
+	}
+}
+
 /*
+ * Read the payload into a buffer of size len, and update the current record's
+ * payload field.
+ * Allocate ra->next_rrd and read the next record's header into
+ * ra->next_rrd->header.
+ * Verify checksum of payload and next record.
+ */
+static int
+receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
+{
+	int err;
+
+	if (len != 0) {
+		ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
+		err = receive_read(ra, len, buf);
+		if (err != 0)
+			return (err);
+		receive_cksum(ra, len, buf);
+
+		/* note: rrd is NULL when reading the begin record's payload */
+		if (ra->rrd != NULL) {
+			ra->rrd->payload = buf;
+			ra->rrd->payload_size = len;
+			ra->rrd->bytes_read = ra->bytes_read;
+		}
+	}
+
+	ra->prev_cksum = ra->cksum;
+
+	ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
+	err = receive_read(ra, sizeof (ra->next_rrd->header),
+	    &ra->next_rrd->header);
+	ra->next_rrd->bytes_read = ra->bytes_read;
+	if (err != 0) {
+		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+		ra->next_rrd = NULL;
+		return (err);
+	}
+	if (ra->next_rrd->header.drr_type == DRR_BEGIN) {
+		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+		ra->next_rrd = NULL;
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Note: checksum is of everything up to but not including the
+	 * checksum itself.
+	 */
+	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+	receive_cksum(ra,
+	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+	    &ra->next_rrd->header);
+
+	zio_cksum_t cksum_orig =
+	    ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
+	zio_cksum_t *cksump =
+	    &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
+
+	if (ra->byteswap)
+		byteswap_record(&ra->next_rrd->header);
+
+	if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
+	    !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) {
+		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+		ra->next_rrd = NULL;
+		return (SET_ERROR(ECKSUM));
+	}
+
+	receive_cksum(ra, sizeof (cksum_orig), &cksum_orig);
+
+	return (0);
+}
+
+static void
+objlist_create(struct objlist *list)
+{
+	list_create(&list->list, sizeof (struct receive_objnode),
+	    offsetof(struct receive_objnode, node));
+	list->last_lookup = 0;
+}
+
+static void
+objlist_destroy(struct objlist *list)
+{
+	for (struct receive_objnode *n = list_remove_head(&list->list);
+	    n != NULL; n = list_remove_head(&list->list)) {
+		kmem_free(n, sizeof (*n));
+	}
+	list_destroy(&list->list);
+}
+
+/*
+ * This function looks through the objlist to see whether it contains the
+ * specified object number.  In the process, it will remove all object
+ * numbers in the list that are smaller than the specified object number.  Thus,
+ * any lookup of an object number smaller than a previously looked up object
+ * number will always return false; therefore, all lookups should be done in
+ * ascending order.
+ */
+static boolean_t
+objlist_exists(struct objlist *list, uint64_t object)
+{
+	struct receive_objnode *node = list_head(&list->list);
+	ASSERT3U(object, >=, list->last_lookup);
+	list->last_lookup = object;
+	while (node != NULL && node->object < object) {
+		VERIFY3P(node, ==, list_remove_head(&list->list));
+		kmem_free(node, sizeof (*node));
+		node = list_head(&list->list);
+	}
+	return (node != NULL && node->object == object);
+}
+
+/*
+ * The objlist is a list of object numbers stored in ascending order.  However,
+ * insertion does not seek out the correct location for a new object number;
+ * for simplicity, new numbers are simply appended to the tail of the list.
+ * Thus, callers must take care to only insert new object numbers in ascending
+ * order.
+ */
+static void
+objlist_insert(struct objlist *list, uint64_t object)
+{
+	struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
+	node->object = object;
+#ifdef ZFS_DEBUG
+	struct receive_objnode *last_object = list_tail(&list->list);
+	uint64_t last_objnum = (last_object != NULL ? last_object->object : 0);
+	ASSERT3U(node->object, >, last_objnum);
+#endif
+	list_insert_tail(&list->list, node);
+}
+
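+/*
+ * Illustrative sketch (editorial example, not code from this commit):
+ * because objlist_exists() trims entries as it walks, inserts and lookups
+ * must both be issued in ascending object order, e.g.:
+ *
+ *	struct objlist ol;
+ *	objlist_create(&ol);
+ *	objlist_insert(&ol, 5);
+ *	objlist_insert(&ol, 9);
+ *	(void) objlist_exists(&ol, 7);	returns B_FALSE, frees entry 5
+ *	(void) objlist_exists(&ol, 9);	returns B_TRUE
+ *	objlist_destroy(&ol);
+ */
+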
+/*
+ * Issue the prefetch reads for any necessary indirect blocks.
+ *
+ * We use the object ignore list to tell us whether or not to issue prefetches
+ * for a given object.  We do this for both correctness (in case the blocksize
+ * of an object has changed) and performance (if the object doesn't exist, don't
+ * needlessly try to issue prefetches).  We also trim the list as we go through
+ * the stream to prevent it from growing to an unbounded size.
+ *
+ * The object numbers within will always be in sorted order, and any write
+ * records we see will also be in sorted order, but they're not sorted with
+ * respect to each other (i.e. we can get several object records before
+ * receiving each object's write records).  As a result, once we've reached a
+ * given object number, we can safely remove any reference to lower object
+ * numbers in the ignore list. In practice, we receive up to 32 object records
+ * before receiving write records, so the list can have up to 32 nodes in it.
+ */
+/* ARGSUSED */
+static void
+receive_read_prefetch(struct receive_arg *ra,
+    uint64_t object, uint64_t offset, uint64_t length)
+{
+	if (!objlist_exists(&ra->ignore_objlist, object)) {
+		dmu_prefetch(ra->os, object, 1, offset, length,
+		    ZIO_PRIORITY_SYNC_READ);
+	}
+}
+
+/*
+ * Read records off the stream, issuing any necessary prefetches.
+ */
+static int
+receive_read_record(struct receive_arg *ra)
+{
+	int err;
+
+	switch (ra->rrd->header.drr_type) {
+	case DRR_OBJECT:
+	{
+		struct drr_object *drro = &ra->rrd->header.drr_u.drr_object;
+		uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8);
+		void *buf = kmem_zalloc(size, KM_SLEEP);
+		dmu_object_info_t doi;
+		err = receive_read_payload_and_next_header(ra, size, buf);
+		if (err != 0) {
+			kmem_free(buf, size);
+			return (err);
+		}
+		err = dmu_object_info(ra->os, drro->drr_object, &doi);
+		/*
+		 * See receive_read_prefetch for an explanation of why we're
+		 * storing this object in the ignore_objlist.
+		 */
+		if (err == ENOENT ||
+		    (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
+			objlist_insert(&ra->ignore_objlist, drro->drr_object);
+			err = 0;
+		}
+		return (err);
+	}
+	case DRR_FREEOBJECTS:
+	{
+		err = receive_read_payload_and_next_header(ra, 0, NULL);
+		return (err);
+	}
+	case DRR_WRITE:
+	{
+		struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write;
+		arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os),
+		    drrw->drr_length);
+
+		err = receive_read_payload_and_next_header(ra,
+		    drrw->drr_length, abuf->b_data);
+		if (err != 0) {
+			dmu_return_arcbuf(abuf);
+			return (err);
+		}
+		ra->rrd->write_buf = abuf;
+		receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset,
+		    drrw->drr_length);
+		return (err);
+	}
+	case DRR_WRITE_BYREF:
+	{
+		struct drr_write_byref *drrwb =
+		    &ra->rrd->header.drr_u.drr_write_byref;
+		err = receive_read_payload_and_next_header(ra, 0, NULL);
+		receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset,
+		    drrwb->drr_length);
+		return (err);
+	}
+	case DRR_WRITE_EMBEDDED:
+	{
+		struct drr_write_embedded *drrwe =
+		    &ra->rrd->header.drr_u.drr_write_embedded;
+		uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
+		void *buf = kmem_zalloc(size, KM_SLEEP);
+
+		err = receive_read_payload_and_next_header(ra, size, buf);
+		if (err != 0) {
+			kmem_free(buf, size);
+			return (err);
+		}
+
+		receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset,
+		    drrwe->drr_length);
+		return (err);
+	}
+	case DRR_FREE:
+	{
+		/*
+		 * It might be beneficial to prefetch indirect blocks here, but
+		 * we don't really have the data to decide for sure.
+		 */
+		err = receive_read_payload_and_next_header(ra, 0, NULL);
+		return (err);
+	}
+	case DRR_END:
+	{
+		struct drr_end *drre = &ra->rrd->header.drr_u.drr_end;
+		if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
+			return (SET_ERROR(ECKSUM));
+		return (0);
+	}
+	case DRR_SPILL:
+	{
+		struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill;
+		void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP);
+		err = receive_read_payload_and_next_header(ra, drrs->drr_length,
+		    buf);
+		if (err != 0)
+			kmem_free(buf, drrs->drr_length);
+		return (err);
+	}
+	default:
+		return (SET_ERROR(EINVAL));
+	}
+}
+
+/*
+ * Commit the records to the pool.
+ */
+static int
+receive_process_record(struct receive_writer_arg *rwa,
+    struct receive_record_arg *rrd)
+{
+	int err;
+
+	/* Processing in order, therefore bytes_read should be increasing. */
+	ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
+	rwa->bytes_read = rrd->bytes_read;
+
+	switch (rrd->header.drr_type) {
+	case DRR_OBJECT:
+	{
+		struct drr_object *drro = &rrd->header.drr_u.drr_object;
+		err = receive_object(rwa, drro, rrd->payload);
+		kmem_free(rrd->payload, rrd->payload_size);
+		rrd->payload = NULL;
+		return (err);
+	}
+	case DRR_FREEOBJECTS:
+	{
+		struct drr_freeobjects *drrfo =
+		    &rrd->header.drr_u.drr_freeobjects;
+		return (receive_freeobjects(rwa, drrfo));
+	}
+	case DRR_WRITE:
+	{
+		struct drr_write *drrw = &rrd->header.drr_u.drr_write;
+		err = receive_write(rwa, drrw, rrd->write_buf);
+		/* if receive_write() is successful, it consumes the arc_buf */
+		if (err != 0)
+			dmu_return_arcbuf(rrd->write_buf);
+		rrd->write_buf = NULL;
+		rrd->payload = NULL;
+		return (err);
+	}
+	case DRR_WRITE_BYREF:
+	{
+		struct drr_write_byref *drrwbr =
+		    &rrd->header.drr_u.drr_write_byref;
+		return (receive_write_byref(rwa, drrwbr));
+	}
+	case DRR_WRITE_EMBEDDED:
+	{
+		struct drr_write_embedded *drrwe =
+		    &rrd->header.drr_u.drr_write_embedded;
+		err = receive_write_embedded(rwa, drrwe, rrd->payload);
+		kmem_free(rrd->payload, rrd->payload_size);
+		rrd->payload = NULL;
+		return (err);
+	}
+	case DRR_FREE:
+	{
+		struct drr_free *drrf = &rrd->header.drr_u.drr_free;
+		return (receive_free(rwa, drrf));
+	}
+	case DRR_SPILL:
+	{
+		struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
+		err = receive_spill(rwa, drrs, rrd->payload);
+		kmem_free(rrd->payload, rrd->payload_size);
+		rrd->payload = NULL;
+		return (err);
+	}
+	default:
+		return (SET_ERROR(EINVAL));
+	}
+}
+
+/*
+ * dmu_recv_stream's worker thread; pull records off the queue, and then call
+ * receive_process_record().  When we're done, signal the main thread and exit.
+ */
+static void
+receive_writer_thread(void *arg)
+{
+	struct receive_writer_arg *rwa = arg;
+	struct receive_record_arg *rrd;
+	for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
+	    rrd = bqueue_dequeue(&rwa->q)) {
+		/*
+		 * If there's an error, the main thread will stop putting things
+		 * on the queue, but we need to clear everything in it before we
+		 * can exit.
+		 */
+		if (rwa->err == 0) {
+			rwa->err = receive_process_record(rwa, rrd);
+		} else if (rrd->write_buf != NULL) {
+			dmu_return_arcbuf(rrd->write_buf);
+			rrd->write_buf = NULL;
+			rrd->payload = NULL;
+		} else if (rrd->payload != NULL) {
+			kmem_free(rrd->payload, rrd->payload_size);
+			rrd->payload = NULL;
+		}
+		kmem_free(rrd, sizeof (*rrd));
+	}
+	kmem_free(rrd, sizeof (*rrd));
+	mutex_enter(&rwa->mutex);
+	rwa->done = B_TRUE;
+	cv_signal(&rwa->cv);
+	mutex_exit(&rwa->mutex);
+	thread_exit();
+}
+
+static int
+resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
+{
+	uint64_t val;
+	objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset;
+	uint64_t dsobj = dmu_objset_id(ra->os);
+	uint64_t resume_obj, resume_off;
+
+	if (nvlist_lookup_uint64(begin_nvl,
+	    "resume_object", &resume_obj) != 0 ||
+	    nvlist_lookup_uint64(begin_nvl,
+	    "resume_offset", &resume_off) != 0) {
+		return (SET_ERROR(EINVAL));
+	}
+	VERIFY0(zap_lookup(mos, dsobj,
+	    DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
+	if (resume_obj != val)
+		return (SET_ERROR(EINVAL));
+	VERIFY0(zap_lookup(mos, dsobj,
+	    DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
+	if (resume_off != val)
+		return (SET_ERROR(EINVAL));
+
+	return (0);
+}
+
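+/*
+ * Illustrative note (assumed stream layout, not verified against this
+ * commit): the begin_nvl checked above is packed into the BEGIN record's
+ * payload by the sender of a resuming stream, e.g.
+ *
+ *	resume_object = <uint64>	where the prior receive left off
+ *	resume_offset = <uint64>
+ *
+ * and both values must match the DS_FIELD_RESUME_OBJECT/OFFSET entries
+ * that save_resume_state() recorded in the dataset's ZAP, or the resume
+ * is rejected with EINVAL.
+ */
+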
+/*
+ * Read in the stream's records, one by one, and apply them to the pool.  There
+ * are two threads involved; the thread that calls this function will spin up a
+ * worker thread, read the records off the stream one by one, and issue
+ * prefetches for any necessary indirect blocks.  It will then push the records
+ * onto an internal blocking queue.  The worker thread will pull the records off
+ * the queue, and actually write the data into the DMU.  This way, the worker
+ * thread doesn't have to wait for reads to complete, since everything it needs
+ * (the indirect blocks) will be prefetched.
+ *
  * NB: callers *must* call dmu_recv_end() if this succeeds.
  */
 int
@@ -1429,11 +2818,11 @@
 dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
     int cleanup_fd, uint64_t *action_handlep)
 {
-	struct restorearg ra = { 0 };
-	dmu_replay_record_t *drr;
-	objset_t *os;
-	zio_cksum_t pcksum;
+	int err = 0;
+	struct receive_arg ra = { 0 };
+	struct receive_writer_arg rwa = { 0 };
 	int featureflags;
+	nvlist_t *begin_nvl = NULL;
 
 	ra.byteswap = drc->drc_byteswap;
 	ra.cksum = drc->drc_cksum;
@@ -1440,9 +2829,15 @@
 	ra.td = curthread;
 	ra.fp = fp;
 	ra.voff = *voffp;
-	ra.bufsize = 1<<20;
-	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
 
+	if (dsl_dataset_is_zapified(drc->drc_ds)) {
+		(void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
+		    drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
+		    sizeof (ra.bytes_read), 1, &ra.bytes_read);
+	}
+
+	objlist_create(&ra.ignore_objlist);
+
 	/* these were verified in dmu_recv_begin */
 	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
 	    DMU_SUBSTREAM);
@@ -1451,9 +2846,9 @@
 	/*
 	 * Open the objset we are modifying.
 	 */
-	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os));
+	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os));
 
-	ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
+	ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
 
 	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
 
@@ -1472,119 +2867,131 @@
 		}
 
 		if (*action_handlep == 0) {
-			ra.guid_to_ds_map =
+			rwa.guid_to_ds_map =
 			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
-			avl_create(ra.guid_to_ds_map, guid_compare,
+			avl_create(rwa.guid_to_ds_map, guid_compare,
 			    sizeof (guid_map_entry_t),
 			    offsetof(guid_map_entry_t, avlnode));
-			ra.err = zfs_onexit_add_cb(minor,
-			    free_guid_map_onexit, ra.guid_to_ds_map,
+			err = zfs_onexit_add_cb(minor,
+			    free_guid_map_onexit, rwa.guid_to_ds_map,
 			    action_handlep);
-			if (ra.err != 0)
+			if (err != 0)
 				goto out;
 		} else {
-			ra.err = zfs_onexit_cb_data(minor, *action_handlep,
-			    (void **)&ra.guid_to_ds_map);
+			err = zfs_onexit_cb_data(minor, *action_handlep,
+			    (void **)&rwa.guid_to_ds_map);
-			if (ra.err != 0)
+			if (err != 0)
 				goto out;
 		}
 
-		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
+		drc->drc_guid_to_ds_map = rwa.guid_to_ds_map;
 	}
 
+	uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
+	void *payload = NULL;
+	if (payloadlen != 0)
+		payload = kmem_alloc(payloadlen, KM_SLEEP);
+
+	err = receive_read_payload_and_next_header(&ra, payloadlen, payload);
+	if (err != 0) {
+		if (payloadlen != 0)
+			kmem_free(payload, payloadlen);
+		goto out;
+	}
+	if (payloadlen != 0) {
+		err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP);
+		kmem_free(payload, payloadlen);
+		if (err != 0)
+			goto out;
+	}
+
+	if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
+		err = resume_check(&ra, begin_nvl);
+		if (err != 0)
+			goto out;
+	}
+
+	(void) bqueue_init(&rwa.q, zfs_recv_queue_length,
+	    offsetof(struct receive_record_arg, node));
+	cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL);
+	rwa.os = ra.os;
+	rwa.byteswap = drc->drc_byteswap;
+	rwa.resumable = drc->drc_resumable;
+
+	(void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, &p0,
+	    TS_RUN, minclsyspri);
 	/*
-	 * Read records and process them.
+	 * We're reading rwa.err without locks, which is safe since we are the
+	 * only reader, and the worker thread is the only writer.  It's ok if we
+	 * miss a write for an iteration or two of the loop, since the writer
+	 * thread will keep freeing records we send it until we send it an eos
+	 * marker.
+	 *
+	 * We can leave this loop in 3 ways:  First, if rwa.err is
+	 * non-zero.  In that case, the writer thread will free the rrd we just
+	 * pushed.  Second, if we're interrupted; in that case, either it's the
+	 * first loop and ra.rrd was never allocated, or it's later, and ra.rrd
+	 * has been handed off to the writer thread who will free it.  Finally,
+	 * if receive_read_record fails or we're at the end of the stream, then
+	 * we free ra.rrd and exit.
 	 */
-	pcksum = ra.cksum;
-	while (ra.err == 0 &&
-	    NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
+	while (rwa.err == 0) {
 		if (issig(JUSTLOOKING) && issig(FORREAL)) {
-			ra.err = SET_ERROR(EINTR);
-			goto out;
+			err = SET_ERROR(EINTR);
+			break;
 		}
 
-		if (ra.byteswap)
-			backup_byteswap(drr);
+		ASSERT3P(ra.rrd, ==, NULL);
+		ra.rrd = ra.next_rrd;
+		ra.next_rrd = NULL;
+		/* Allocates and loads header into ra.next_rrd */
+		err = receive_read_record(&ra);
 
-		switch (drr->drr_type) {
-		case DRR_OBJECT:
-		{
-			/*
-			 * We need to make a copy of the record header,
-			 * because restore_{object,write} may need to
-			 * restore_read(), which will invalidate drr.
-			 */
-			struct drr_object drro = drr->drr_u.drr_object;
-			ra.err = restore_object(&ra, os, &drro);
+		if (ra.rrd->header.drr_type == DRR_END || err != 0) {
+			kmem_free(ra.rrd, sizeof (*ra.rrd));
+			ra.rrd = NULL;
 			break;
 		}
-		case DRR_FREEOBJECTS:
-		{
-			struct drr_freeobjects drrfo =
-			    drr->drr_u.drr_freeobjects;
-			ra.err = restore_freeobjects(&ra, os, &drrfo);
-			break;
-		}
-		case DRR_WRITE:
-		{
-			struct drr_write drrw = drr->drr_u.drr_write;
-			ra.err = restore_write(&ra, os, &drrw);
-			break;
-		}
-		case DRR_WRITE_BYREF:
-		{
-			struct drr_write_byref drrwbr =
-			    drr->drr_u.drr_write_byref;
-			ra.err = restore_write_byref(&ra, os, &drrwbr);
-			break;
-		}
-		case DRR_FREE:
-		{
-			struct drr_free drrf = drr->drr_u.drr_free;
-			ra.err = restore_free(&ra, os, &drrf);
-			break;
-		}
-		case DRR_END:
-		{
-			struct drr_end drre = drr->drr_u.drr_end;
-			/*
-			 * We compare against the *previous* checksum
-			 * value, because the stored checksum is of
-			 * everything before the DRR_END record.
-			 */
-			if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
-				ra.err = SET_ERROR(ECKSUM);
-			goto out;
-		}
-		case DRR_SPILL:
-		{
-			struct drr_spill drrs = drr->drr_u.drr_spill;
-			ra.err = restore_spill(&ra, os, &drrs);
-			break;
-		}
-		default:
-			ra.err = SET_ERROR(EINVAL);
-			goto out;
-		}
-		pcksum = ra.cksum;
+
+		bqueue_enqueue(&rwa.q, ra.rrd,
+		    sizeof (struct receive_record_arg) + ra.rrd->payload_size);
+		ra.rrd = NULL;
 	}
-	ASSERT(ra.err != 0);
+	if (ra.next_rrd == NULL)
+		ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP);
+	ra.next_rrd->eos_marker = B_TRUE;
+	bqueue_enqueue(&rwa.q, ra.next_rrd, 1);
 
+	mutex_enter(&rwa.mutex);
+	while (!rwa.done) {
+		cv_wait(&rwa.cv, &rwa.mutex);
+	}
+	mutex_exit(&rwa.mutex);
+
+	cv_destroy(&rwa.cv);
+	mutex_destroy(&rwa.mutex);
+	bqueue_destroy(&rwa.q);
+	if (err == 0)
+		err = rwa.err;
+
 out:
+	nvlist_free(begin_nvl);
 	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
 		zfs_onexit_fd_rele(cleanup_fd);
 
-	if (ra.err != 0) {
+	if (err != 0) {
 		/*
-		 * destroy what we created, so we don't leave it in the
-		 * inconsistent restoring state.
+		 * Clean up references. If receive is not resumable,
+		 * destroy what we created, so we don't leave it in
+		 * the inconsistent state.
 		 */
 		dmu_recv_cleanup_ds(drc);
 	}
 
-	kmem_free(ra.buf, ra.bufsize);
 	*voffp = ra.voff;
-	return (ra.err);
+	objlist_destroy(&ra.ignore_objlist);
+	return (err);
 }
 
 static int
@@ -1602,14 +3009,47 @@
 		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
 		if (error != 0)
 			return (error);
+		if (drc->drc_force) {
+			/*
+			 * We will destroy any snapshots in tofs (i.e. before
+			 * origin_head) that are after the origin (which is
+			 * the snap before drc_ds, because drc_ds can not
+			 * have any snaps of its own).
+			 */
+			uint64_t obj;
+
+			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+			while (obj !=
+			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
+				dsl_dataset_t *snap;
+				error = dsl_dataset_hold_obj(dp, obj, FTAG,
+				    &snap);
+				if (error != 0)
+					break;
+				if (snap->ds_dir != origin_head->ds_dir)
+					error = SET_ERROR(EINVAL);
+				if (error == 0)  {
+					error = dsl_destroy_snapshot_check_impl(
+					    snap, B_FALSE);
+				}
+				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+				dsl_dataset_rele(snap, FTAG);
+				if (error != 0)
+					break;
+			}
+			if (error != 0) {
+				dsl_dataset_rele(origin_head, FTAG);
+				return (error);
+			}
+		}
 		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
-		    origin_head, drc->drc_force);
+		    origin_head, drc->drc_force, drc->drc_owner, tx);
 		if (error != 0) {
 			dsl_dataset_rele(origin_head, FTAG);
 			return (error);
 		}
 		error = dsl_dataset_snapshot_check_impl(origin_head,
-		    drc->drc_tosnap, tx);
+		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
 		dsl_dataset_rele(origin_head, FTAG);
 		if (error != 0)
 			return (error);
@@ -1617,7 +3057,7 @@
 		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
 	} else {
 		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
-		    drc->drc_tosnap, tx);
+		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
 	}
 	return (error);
 }
@@ -1636,6 +3076,30 @@
 
 		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
 		    &origin_head));
+
+		if (drc->drc_force) {
+			/*
+			 * Destroy any snapshots of drc_tofs (origin_head)
+			 * after the origin (the snap before drc_ds).
+			 */
+			uint64_t obj;
+
+			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+			while (obj !=
+			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
+				dsl_dataset_t *snap;
+				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
+				    &snap));
+				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
+				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+				dsl_destroy_snapshot_sync_impl(snap,
+				    B_FALSE, tx);
+				dsl_dataset_rele(snap, FTAG);
+			}
+		}
+		VERIFY3P(drc->drc_ds->ds_prev, ==,
+		    origin_head->ds_prev);
+
 		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
 		    origin_head, tx);
 		dsl_dataset_snapshot_sync_impl(origin_head,
@@ -1643,18 +3107,25 @@
 
 		/* set snapshot's creation time and guid */
 		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
-		origin_head->ds_prev->ds_phys->ds_creation_time =
+		dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
 		    drc->drc_drrb->drr_creation_time;
-		origin_head->ds_prev->ds_phys->ds_guid =
+		dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
 		    drc->drc_drrb->drr_toguid;
-		origin_head->ds_prev->ds_phys->ds_flags &=
+		dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
 		    ~DS_FLAG_INCONSISTENT;
 
 		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
-		origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+		dsl_dataset_phys(origin_head)->ds_flags &=
+		    ~DS_FLAG_INCONSISTENT;
 
+		drc->drc_newsnapobj =
+		    dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+
 		dsl_dataset_rele(origin_head, FTAG);
 		dsl_destroy_head_sync_impl(drc->drc_ds, tx);
+
+		if (drc->drc_owner != NULL)
+			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
 	} else {
 		dsl_dataset_t *ds = drc->drc_ds;
 
@@ -1662,15 +3133,32 @@
 
 		/* set snapshot's creation time and guid */
 		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-		ds->ds_prev->ds_phys->ds_creation_time =
+		dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
 		    drc->drc_drrb->drr_creation_time;
-		ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid;
-		ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+		dsl_dataset_phys(ds->ds_prev)->ds_guid =
+		    drc->drc_drrb->drr_toguid;
+		dsl_dataset_phys(ds->ds_prev)->ds_flags &=
+		    ~DS_FLAG_INCONSISTENT;
 
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+		if (dsl_dataset_has_resume_receive_state(ds)) {
+			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+			    DS_FIELD_RESUME_FROMGUID, tx);
+			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+			    DS_FIELD_RESUME_OBJECT, tx);
+			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+			    DS_FIELD_RESUME_OFFSET, tx);
+			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+			    DS_FIELD_RESUME_BYTES, tx);
+			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+			    DS_FIELD_RESUME_TOGUID, tx);
+			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+			    DS_FIELD_RESUME_TONAME, tx);
+		}
+		drc->drc_newsnapobj =
+		    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
 	}
-	drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj;
 	/*
 	 * Release the hold from dmu_recv_begin.  This must be done before
 	 * we return to open context, so that when we free the dataset's dnode,
@@ -1696,7 +3184,7 @@
 	gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
 	err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
 	if (err == 0) {
-		gmep->guid = snapds->ds_phys->ds_guid;
+		gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
 		gmep->gme_ds = snapds;
 		avl_add(guid_map, gmep);
 		dsl_dataset_long_hold(snapds, gmep);
@@ -1712,36 +3200,41 @@
 static int
 dmu_recv_existing_end(dmu_recv_cookie_t *drc)
 {
-	int error;
-	char name[MAXNAMELEN];
-
 #ifdef _KERNEL
 	/*
 	 * We will be destroying the ds; make sure its origin is unmounted if
 	 * necessary.
 	 */
+	char name[ZFS_MAX_DATASET_NAME_LEN];
 	dsl_dataset_name(drc->drc_ds, name);
 	zfs_destroy_unmount_origin(name);
 #endif
 
-	error = dsl_sync_task(drc->drc_tofs,
+	return (dsl_sync_task(drc->drc_tofs,
 	    dmu_recv_end_check, dmu_recv_end_sync, drc,
-	    dmu_recv_end_modified_blocks);
-
-	if (error != 0)
-		dmu_recv_cleanup_ds(drc);
-	return (error);
+	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
 }
 
 static int
 dmu_recv_new_end(dmu_recv_cookie_t *drc)
 {
+	return (dsl_sync_task(drc->drc_tofs,
+	    dmu_recv_end_check, dmu_recv_end_sync, drc,
+	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
+}
+
+int
+dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
+{
 	int error;
 
-	error = dsl_sync_task(drc->drc_tofs,
-	    dmu_recv_end_check, dmu_recv_end_sync, drc,
-	    dmu_recv_end_modified_blocks);
+	drc->drc_owner = owner;
 
+	if (drc->drc_newfs)
+		error = dmu_recv_new_end(drc);
+	else
+		error = dmu_recv_existing_end(drc);
+
 	if (error != 0) {
 		dmu_recv_cleanup_ds(drc);
 	} else if (drc->drc_guid_to_ds_map != NULL) {
@@ -1752,11 +3245,12 @@
 	return (error);
 }
 
-int
-dmu_recv_end(dmu_recv_cookie_t *drc)
+/*
+ * Return TRUE if this objset is currently being received into.
+ */
+boolean_t
+dmu_objset_is_receiving(objset_t *os)
 {
-	if (drc->drc_newfs)
-		return (dmu_recv_new_end(drc));
-	else
-		return (dmu_recv_existing_end(drc));
+	return (os->os_dsl_dataset != NULL &&
+	    os->os_dsl_dataset->ds_owner == dmu_recv_tag);
 }
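
The rewritten dmu_recv_stream() above is a single-producer/single-consumer
pipeline: the calling thread reads the stream and issues prefetches, while
receive_writer_thread() applies records to the DMU.  A minimal sketch of
the producer half of that handshake follows; the bqueue_*, cv_*, mutex_*,
and kmem_* primitives are the ones used in the diff, but struct pipe,
struct rec, and read_next_record() are invented for illustration:

	struct rec {
		bqueue_node_t	node;		/* bqueue linkage */
		boolean_t	eos_marker;	/* end-of-stream sentinel */
	};

	struct pipe {
		bqueue_t	q;
		kmutex_t	mtx;
		kcondvar_t	cv;
		boolean_t	done;
		int		err;		/* written only by the worker */
	};

	static void
	producer(struct pipe *p)
	{
		struct rec *r;

		/* Stop reading early if the worker has reported an error. */
		while (p->err == 0 && (r = read_next_record()) != NULL)
			bqueue_enqueue(&p->q, r, sizeof (*r));

		/* Always push the sentinel so the worker drains and exits. */
		r = kmem_zalloc(sizeof (*r), KM_SLEEP);
		r->eos_marker = B_TRUE;
		bqueue_enqueue(&p->q, r, 1);

		/* Wait for the worker's cv_signal(), as in the diff. */
		mutex_enter(&p->mtx);
		while (!p->done)
			cv_wait(&p->cv, &p->mtx);
		mutex_exit(&p->mtx);
	}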

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2015 Chunwei Chen. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -37,17 +38,25 @@
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/callb.h>
+#include <sys/zfeature.h>
 
-int zfs_pd_blks_max = 100;
+int32_t zfs_pd_bytes_max = 50 * 1024 * 1024;	/* 50MB */
+boolean_t send_holes_without_birth_time = B_TRUE;
 
+#ifdef _KERNEL
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, send_holes_without_birth_time, CTLFLAG_RWTUN,
+    &send_holes_without_birth_time, 0, "Send holes without birth time");
+#endif
+
 typedef struct prefetch_data {
 	kmutex_t pd_mtx;
 	kcondvar_t pd_cv;
-	int pd_blks_max;
-	int pd_blks_fetched;
+	int32_t pd_bytes_fetched;
 	int pd_flags;
 	boolean_t pd_cancel;
 	boolean_t pd_exited;
+	zbookmark_phys_t pd_resume;
 } prefetch_data_t;
 
 typedef struct traverse_data {
@@ -55,11 +64,14 @@
 	uint64_t td_objset;
 	blkptr_t *td_rootbp;
 	uint64_t td_min_txg;
-	zbookmark_t *td_resume;
+	zbookmark_phys_t *td_resume;
 	int td_flags;
 	prefetch_data_t *td_pfd;
+	boolean_t td_paused;
+	uint64_t td_hole_birth_enabled_txg;
 	blkptr_cb_t *td_func;
 	void *td_arg;
+	boolean_t td_realloc_possible;
 } traverse_data_t;
 
 static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
@@ -71,9 +83,9 @@
 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	traverse_data_t *td = arg;
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 
-	if (bp->blk_birth == 0)
+	if (BP_IS_HOLE(bp))
 		return (0);
 
 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
@@ -95,9 +107,9 @@
 	if (lrc->lrc_txtype == TX_WRITE) {
 		lr_write_t *lr = (lr_write_t *)lrc;
 		blkptr_t *bp = &lr->lr_blkptr;
-		zbookmark_t zb;
+		zbookmark_phys_t zb;
 
-		if (bp->blk_birth == 0)
+		if (BP_IS_HOLE(bp))
 			return (0);
 
 		if (claim_txg == 0 || bp->blk_birth < claim_txg)
@@ -149,7 +161,7 @@
  */
 static resume_skip_t
 resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
-    const zbookmark_t *zb)
+    const zbookmark_phys_t *zb)
 {
 	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
 		/*
@@ -156,7 +168,7 @@
 		 * If we already visited this bp & everything below,
 		 * don't bother doing it again.
 		 */
-		if (zbookmark_is_before(dnp, zb, td->td_resume))
+		if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
 			return (RESUME_SKIP_ALL);
 
 		/*
@@ -163,7 +175,6 @@
 		 * If we found the block we're trying to resume from, zero
 		 * the bookmark out to indicate that we have resumed.
 		 */
-		ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object);
 		if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
 			bzero(td->td_resume, sizeof (*zb));
 			if (td->td_flags & TRAVERSE_POST)
@@ -174,18 +185,10 @@
 }
 
 static void
-traverse_pause(traverse_data_t *td, const zbookmark_t *zb)
-{
-	ASSERT(td->td_resume != NULL);
-	ASSERT0(zb->zb_level);
-	bcopy(zb, td->td_resume, sizeof (*td->td_resume));
-}
-
-static void
 traverse_prefetch_metadata(traverse_data_t *td,
-    const blkptr_t *bp, const zbookmark_t *zb)
+    const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
-	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 
 	if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
 		return;
@@ -205,16 +208,25 @@
 	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 }
 
+static boolean_t
+prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
+{
+	ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
+	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
+	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
+		return (B_FALSE);
+	return (B_TRUE);
+}
+
 static int
 traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
-    const blkptr_t *bp, const zbookmark_t *zb)
+    const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
-	zbookmark_t czb;
-	int err = 0, lasterr = 0;
+	zbookmark_phys_t czb;
+	int err = 0;
 	arc_buf_t *buf = NULL;
 	prefetch_data_t *pd = td->td_pfd;
 	boolean_t hard = td->td_flags & TRAVERSE_HARD;
-	boolean_t pause = B_FALSE;
 
 	switch (resume_skip_check(td, dnp, zb)) {
 	case RESUME_SKIP_ALL:
@@ -227,39 +239,67 @@
 		ASSERT(0);
 	}
 
-	if (BP_IS_HOLE(bp)) {
-		err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg);
-		return (err);
+	if (bp->blk_birth == 0) {
+		/*
+		 * Since this block has a birth time of 0 it must be one of
+		 * two things: a hole created before the
+		 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
+		 * which has always been a hole in an object.
+		 *
+		 * If a file is written sparsely, then the unwritten parts of
+		 * the file were "always holes" -- that is, they have been
+		 * holes since this object was allocated.  However, we (and
+		 * our callers) can not necessarily tell when an object was
+		 * allocated.  Therefore, if it's possible that this object
+		 * was freed and then its object number reused, we need to
+		 * visit all the holes with birth==0.
+		 *
+		 * If it isn't possible that the object number was reused,
+		 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
+		 * all the blocks we will visit as part of this traversal,
+		 * then this hole must have always existed, so we can skip
+		 * it.  We visit blocks born after (exclusive) td_min_txg.
+		 *
+		 * Note that the meta-dnode cannot be reallocated.
+		 */
+		if (!send_holes_without_birth_time &&
+		    (!td->td_realloc_possible ||
+		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
+		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
+			return (0);
+	} else if (bp->blk_birth <= td->td_min_txg) {
+		return (0);
 	}
 
-	if (bp->blk_birth <= td->td_min_txg)
-		return (0);
-
-	if (pd && !pd->pd_exited &&
-	    ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
-	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
+	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
+		uint64_t size = BP_GET_LSIZE(bp);
 		mutex_enter(&pd->pd_mtx);
-		ASSERT(pd->pd_blks_fetched >= 0);
-		while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
+		ASSERT(pd->pd_bytes_fetched >= 0);
+		while (pd->pd_bytes_fetched < size && !pd->pd_exited)
 			cv_wait(&pd->pd_cv, &pd->pd_mtx);
-		pd->pd_blks_fetched--;
+		pd->pd_bytes_fetched -= size;
 		cv_broadcast(&pd->pd_cv);
 		mutex_exit(&pd->pd_mtx);
 	}
 
+	if (BP_IS_HOLE(bp)) {
+		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
+		if (err != 0)
+			goto post;
+		return (0);
+	}
+
 	if (td->td_flags & TRAVERSE_PRE) {
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
 		    td->td_arg);
 		if (err == TRAVERSE_VISIT_NO_CHILDREN)
 			return (0);
-		if (err == ERESTART)
-			pause = B_TRUE; /* handle pausing at a common point */
 		if (err != 0)
 			goto post;
 	}
 
 	if (BP_GET_LEVEL(bp) > 0) {
-		uint32_t flags = ARC_WAIT;
+		arc_flags_t flags = ARC_FLAG_WAIT;
 		int i;
 		blkptr_t *cbp;
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
@@ -267,7 +307,7 @@
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err != 0)
-			return (err);
+			goto post;
 		cbp = buf->b_data;
 
 		for (i = 0; i < epb; i++) {
@@ -283,14 +323,11 @@
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			err = traverse_visitbp(td, dnp, &cbp[i], &czb);
-			if (err != 0) {
-				if (!hard)
-					break;
-				lasterr = err;
-			}
+			if (err != 0)
+				break;
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
-		uint32_t flags = ARC_WAIT;
+		arc_flags_t flags = ARC_FLAG_WAIT;
 		int i;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 
@@ -297,84 +334,101 @@
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err != 0)
-			return (err);
-		dnp = buf->b_data;
+			goto post;
+		dnode_phys_t *child_dnp = buf->b_data;
 
 		for (i = 0; i < epb; i++) {
-			prefetch_dnode_metadata(td, &dnp[i], zb->zb_objset,
-			    zb->zb_blkid * epb + i);
+			prefetch_dnode_metadata(td, &child_dnp[i],
+			    zb->zb_objset, zb->zb_blkid * epb + i);
 		}
 
 		/* recursively visitbp() blocks below this */
 		for (i = 0; i < epb; i++) {
-			err = traverse_dnode(td, &dnp[i], zb->zb_objset,
-			    zb->zb_blkid * epb + i);
-			if (err != 0) {
-				if (!hard)
-					break;
-				lasterr = err;
-			}
+			err = traverse_dnode(td, &child_dnp[i],
+			    zb->zb_objset, zb->zb_blkid * epb + i);
+			if (err != 0)
+				break;
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
-		uint32_t flags = ARC_WAIT;
-		objset_phys_t *osp;
-		dnode_phys_t *dnp;
+		arc_flags_t flags = ARC_FLAG_WAIT;
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err != 0)
-			return (err);
+			goto post;
 
-		osp = buf->b_data;
-		dnp = &osp->os_meta_dnode;
-		prefetch_dnode_metadata(td, dnp, zb->zb_objset,
+		objset_phys_t *osp = buf->b_data;
+		prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
 		    DMU_META_DNODE_OBJECT);
+		/*
+		 * See the block comment above for the goal of this variable.
+		 * If the maxblkid of the meta-dnode is 0, then we know that
+		 * we've never had more than DNODES_PER_BLOCK objects in the
+		 * dataset, which means we can't have reused any object ids.
+		 */
+		if (osp->os_meta_dnode.dn_maxblkid == 0)
+			td->td_realloc_possible = B_FALSE;
+
 		if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+			prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
+			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
 			prefetch_dnode_metadata(td, &osp->os_userused_dnode,
 			    zb->zb_objset, DMU_USERUSED_OBJECT);
-			prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
-			    zb->zb_objset, DMU_USERUSED_OBJECT);
 		}
 
-		err = traverse_dnode(td, dnp, zb->zb_objset,
+		err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset,
 		    DMU_META_DNODE_OBJECT);
-		if (err && hard) {
-			lasterr = err;
-			err = 0;
-		}
 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
-			dnp = &osp->os_userused_dnode;
-			err = traverse_dnode(td, dnp, zb->zb_objset,
-			    DMU_USERUSED_OBJECT);
+			err = traverse_dnode(td, &osp->os_groupused_dnode,
+			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
 		}
-		if (err && hard) {
-			lasterr = err;
-			err = 0;
-		}
 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
-			dnp = &osp->os_groupused_dnode;
-			err = traverse_dnode(td, dnp, zb->zb_objset,
-			    DMU_GROUPUSED_OBJECT);
+			err = traverse_dnode(td, &osp->os_userused_dnode,
+			    zb->zb_objset, DMU_USERUSED_OBJECT);
 		}
 	}
 
 	if (buf)
-		(void) arc_buf_remove_ref(buf, &buf);
+		arc_buf_destroy(buf, &buf);
 
 post:
-	if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
+	if (err == 0 && (td->td_flags & TRAVERSE_POST))
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
-		if (err == ERESTART)
-			pause = B_TRUE;
+
+	if (hard && (err == EIO || err == ECKSUM)) {
+		/*
+		 * Ignore this disk error as requested by the HARD flag,
+		 * and continue traversal.
+		 */
+		err = 0;
 	}
 
-	if (pause && td->td_resume != NULL) {
-		ASSERT3U(err, ==, ERESTART);
-		ASSERT(!hard);
-		traverse_pause(td, zb);
+	/*
+	 * If we are stopping here, set td_resume.
+	 */
+	if (td->td_resume != NULL && err != 0 && !td->td_paused) {
+		td->td_resume->zb_objset = zb->zb_objset;
+		td->td_resume->zb_object = zb->zb_object;
+		td->td_resume->zb_level = 0;
+		/*
+		 * If we have stopped on an indirect block (e.g. due to
+		 * i/o error), we have not visited anything below it.
+		 * Set the bookmark to the first level-0 block that we need
+		 * to visit.  This way, the resuming code does not need to
+		 * deal with resuming from indirect blocks.
+		 *
+		 * Note, if zb_level <= 0, dnp may be NULL, so we don't want
+		 * to dereference it.
+		 */
+		td->td_resume->zb_blkid = zb->zb_blkid;
+		if (zb->zb_level > 0) {
+			td->td_resume->zb_blkid <<= zb->zb_level *
+			    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
+		}
+		td->td_paused = B_TRUE;
 	}
 
-	return (err != 0 ? err : lasterr);
+	return (err);
 }
 
 static void
@@ -382,7 +436,7 @@
     uint64_t objset, uint64_t object)
 {
 	int j;
-	zbookmark_t czb;
+	zbookmark_phys_t czb;
 
 	for (j = 0; j < dnp->dn_nblkptr; j++) {
 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
@@ -399,53 +453,70 @@
 traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
     uint64_t objset, uint64_t object)
 {
-	int j, err = 0, lasterr = 0;
-	zbookmark_t czb;
-	boolean_t hard = (td->td_flags & TRAVERSE_HARD);
+	int j, err = 0;
+	zbookmark_phys_t czb;
 
+	if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
+	    object < td->td_resume->zb_object)
+		return (0);
+
+	if (td->td_flags & TRAVERSE_PRE) {
+		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+		    ZB_DNODE_BLKID);
+		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+		    td->td_arg);
+		if (err == TRAVERSE_VISIT_NO_CHILDREN)
+			return (0);
+		if (err != 0)
+			return (err);
+	}
+
 	for (j = 0; j < dnp->dn_nblkptr; j++) {
 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
 		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
-		if (err != 0) {
-			if (!hard)
-				break;
-			lasterr = err;
-		}
+		if (err != 0)
+			break;
 	}
 
-	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+	if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
 		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
 		err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
-		if (err != 0) {
-			if (!hard)
-				return (err);
-			lasterr = err;
-		}
 	}
-	return (err != 0 ? err : lasterr);
+
+	if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
+		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+		    ZB_DNODE_BLKID);
+		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+		    td->td_arg);
+		if (err == TRAVERSE_VISIT_NO_CHILDREN)
+			return (0);
+		if (err != 0)
+			return (err);
+	}
+	return (err);
 }
 
 /* ARGSUSED */
 static int
 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	prefetch_data_t *pfd = arg;
-	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
+	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 
-	ASSERT(pfd->pd_blks_fetched >= 0);
+	ASSERT(pfd->pd_bytes_fetched >= 0);
+	if (bp == NULL)
+		return (0);
 	if (pfd->pd_cancel)
 		return (SET_ERROR(EINTR));
 
-	if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
-	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
-	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
+	if (!prefetch_needed(pfd, bp))
 		return (0);
 
 	mutex_enter(&pfd->pd_mtx);
-	while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
+	while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
 		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
-	pfd->pd_blks_fetched++;
+	pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
 	cv_broadcast(&pfd->pd_cv);
 	mutex_exit(&pfd->pd_mtx);
 
@@ -460,11 +531,12 @@
 {
 	traverse_data_t *td_main = arg;
 	traverse_data_t td = *td_main;
-	zbookmark_t czb;
+	zbookmark_phys_t czb;
 
 	td.td_func = traverse_prefetcher;
 	td.td_arg = td_main->td_pfd;
 	td.td_pfd = NULL;
+	td.td_resume = &td_main->td_pfd->pd_resume;
 
 	SET_BOOKMARK(&czb, td.td_objset,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
@@ -482,23 +554,17 @@
  */
 static int
 traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
-    uint64_t txg_start, zbookmark_t *resume, int flags,
+    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
     blkptr_cb_t func, void *arg)
 {
 	traverse_data_t td;
 	prefetch_data_t pd = { 0 };
-	zbookmark_t czb;
+	zbookmark_phys_t czb;
 	int err;
 
 	ASSERT(ds == NULL || objset == ds->ds_object);
 	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
 
-	/*
-	 * The data prefetching mechanism (the prefetch thread) is incompatible
-	 * with resuming from a bookmark.
-	 */
-	ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA));
-
 	td.td_spa = spa;
 	td.td_objset = objset;
 	td.td_rootbp = rootbp;
@@ -508,15 +574,25 @@
 	td.td_arg = arg;
 	td.td_pfd = &pd;
 	td.td_flags = flags;
+	td.td_paused = B_FALSE;
+	td.td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);
 
-	pd.pd_blks_max = zfs_pd_blks_max;
+	if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+		VERIFY(spa_feature_enabled_txg(spa,
+		    SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
+	} else {
+		td.td_hole_birth_enabled_txg = UINT64_MAX;
+	}
+
 	pd.pd_flags = flags;
+	if (resume != NULL)
+		pd.pd_resume = *resume;
 	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
 
 	/* See comment on ZIL traversal in dsl_scan_visitds. */
-	if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) {
-		uint32_t flags = ARC_WAIT;
+	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
+		arc_flags_t flags = ARC_FLAG_WAIT;
 		objset_phys_t *osp;
 		arc_buf_t *buf;
 
@@ -528,7 +604,7 @@
 
 		osp = buf->b_data;
 		traverse_zil(&td, &osp->os_zil_header);
-		(void) arc_buf_remove_ref(buf, &buf);
+		arc_buf_destroy(buf, &buf);
 	}
 
 	if (!(flags & TRAVERSE_PREFETCH_DATA) ||
@@ -558,16 +634,24 @@
  * in syncing context).
  */
 int
-traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
-    blkptr_cb_t func, void *arg)
+traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
+    zbookmark_phys_t *resume,
+    int flags, blkptr_cb_t func, void *arg)
 {
 	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
-	    &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg));
+	    &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
 }
 
 int
+traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
+    int flags, blkptr_cb_t func, void *arg)
+{
+	return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
+}
+
+int
 traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
-    uint64_t txg_start, zbookmark_t *resume, int flags,
+    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
     blkptr_cb_t func, void *arg)
 {
 	return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
@@ -581,8 +665,7 @@
 traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
     blkptr_cb_t func, void *arg)
 {
-	int err, lasterr = 0;
-	uint64_t obj;
+	int err;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	objset_t *mos = dp->dp_meta_objset;
 	boolean_t hard = (flags & TRAVERSE_HARD);
@@ -594,19 +677,18 @@
 		return (err);
 
 	/* visit each dataset */
-	for (obj = 1; err == 0 || (err != ESRCH && hard);
-	    err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
+	for (uint64_t obj = 1; err == 0;
+	    err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
 		dmu_object_info_t doi;
 
 		err = dmu_object_info(mos, obj, &doi);
 		if (err != 0) {
-			if (!hard)
-				return (err);
-			lasterr = err;
-			continue;
+			if (hard)
+				continue;
+			break;
 		}
 
-		if (doi.doi_type == DMU_OT_DSL_DATASET) {
+		if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
 			dsl_dataset_t *ds;
 			uint64_t txg = txg_start;
 
@@ -614,23 +696,19 @@
 			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
 			dsl_pool_config_exit(dp, FTAG);
 			if (err != 0) {
-				if (!hard)
-					return (err);
-				lasterr = err;
-				continue;
+				if (hard)
+					continue;
+				break;
 			}
-			if (ds->ds_phys->ds_prev_snap_txg > txg)
-				txg = ds->ds_phys->ds_prev_snap_txg;
+			if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
+				txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 			err = traverse_dataset(ds, txg, flags, func, arg);
 			dsl_dataset_rele(ds, FTAG);
-			if (err != 0) {
-				if (!hard)
-					return (err);
-				lasterr = err;
-			}
+			if (err != 0)
+				break;
 		}
 	}
 	if (err == ESRCH)
 		err = 0;
-	return (err != 0 ? err : lasterr);
+	return (err);
 }
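
The prefetch throttle in this file now accounts bytes rather than block
counts, which keeps prefetch memory bounded even when large blocks are in
use.  Condensed from the hunks above (the prefetcher side is from
traverse_prefetcher(), the consumer side from traverse_visitbp()), the
two halves of the handshake are:

	/* prefetcher: block while >= zfs_pd_bytes_max bytes are in flight */
	mutex_enter(&pfd->pd_mtx);
	while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
	pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
	cv_broadcast(&pfd->pd_cv);
	mutex_exit(&pfd->pd_mtx);

	/* consumer: wait until this block's bytes have been prefetched */
	uint64_t size = BP_GET_LSIZE(bp);
	mutex_enter(&pd->pd_mtx);
	while (pd->pd_bytes_fetched < size && !pd->pd_exited)
		cv_wait(&pd->pd_cv, &pd->pd_mtx);
	pd->pd_bytes_fetched -= size;
	cv_broadcast(&pd->pd_cv);
	mutex_exit(&pd->pd_mtx);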

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,7 +22,8 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/dmu.h>
@@ -55,6 +56,7 @@
 	    offsetof(dmu_tx_hold_t, txh_node));
 	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
 	    offsetof(dmu_tx_callback_t, dcb_node));
+	tx->tx_start = gethrtime();
 #ifdef ZFS_DEBUG
 	refcount_create(&tx->tx_space_written);
 	refcount_create(&tx->tx_space_freed);
@@ -128,6 +130,12 @@
 	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
 	txh->txh_tx = tx;
 	txh->txh_dnode = dn;
+	refcount_create(&txh->txh_space_towrite);
+	refcount_create(&txh->txh_space_tofree);
+	refcount_create(&txh->txh_space_tooverwrite);
+	refcount_create(&txh->txh_space_tounref);
+	refcount_create(&txh->txh_memory_tohold);
+	refcount_create(&txh->txh_fudge);
 #ifdef ZFS_DEBUG
 	txh->txh_type = type;
 	txh->txh_arg1 = arg1;
@@ -200,13 +208,19 @@
 	freeable = (bp && (freeable ||
 	    dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
 
-	if (freeable)
-		txh->txh_space_tooverwrite += space;
-	else
-		txh->txh_space_towrite += space;
-	if (bp)
-		txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
+	if (freeable) {
+		(void) refcount_add_many(&txh->txh_space_tooverwrite,
+		    space, FTAG);
+	} else {
+		(void) refcount_add_many(&txh->txh_space_towrite,
+		    space, FTAG);
+	}
 
+	if (bp) {
+		(void) refcount_add_many(&txh->txh_space_tounref,
+		    bp_get_dsize(os->os_spa, bp), FTAG);
+	}
+
 	dmu_tx_count_twig(txh, dn, parent, level + 1,
 	    blkid >> epbs, freeable, history);
 }
@@ -224,7 +238,7 @@
 		return;
 
 	min_bs = SPA_MINBLOCKSHIFT;
-	max_bs = SPA_MAXBLOCKSHIFT;
+	max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
 	min_ibs = DN_MIN_INDBLKSHIFT;
 	max_ibs = DN_MAX_INDBLKSHIFT;
 
@@ -293,6 +307,14 @@
 			 */
 			ASSERT(dn->dn_datablkshift != 0);
 			min_bs = max_bs = dn->dn_datablkshift;
+		} else {
+			/*
+			 * The blocksize can increase up to the recordsize,
+			 * or if it is already more than the recordsize,
+			 * up to the next power of 2.
+			 */
+			min_bs = highbit64(dn->dn_datablksz - 1);
+			max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
 		}
 
 		/*
@@ -307,7 +329,8 @@
 			dmu_buf_impl_t *db;
 
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
-			err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
+			err = dbuf_hold_impl(dn, 0, start,
+			    FALSE, FALSE, FTAG, &db);
 			rw_exit(&dn->dn_struct_rwlock);
 
 			if (err) {
@@ -326,8 +349,11 @@
 				bits = 64 - min_bs;
 				epbs = min_ibs - SPA_BLKPTRSHIFT;
 				for (bits -= epbs * (nlvls - 1);
-				    bits >= 0; bits -= epbs)
-					txh->txh_fudge += 1ULL << max_ibs;
+				    bits >= 0; bits -= epbs) {
+					(void) refcount_add_many(
+					    &txh->txh_fudge,
+					    1ULL << max_ibs, FTAG);
+				}
 				goto out;
 			}
 			off += delta;
@@ -343,7 +369,8 @@
 	 */
 	start = P2ALIGN(off, 1ULL << max_bs);
 	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
-	txh->txh_space_towrite += end - start + 1;
+	(void) refcount_add_many(&txh->txh_space_towrite,
+	    end - start + 1, FTAG);
 
 	start >>= min_bs;
 	end >>= min_bs;
@@ -358,18 +385,21 @@
 		start >>= epbs;
 		end >>= epbs;
 		ASSERT3U(end, >=, start);
-		txh->txh_space_towrite += (end - start + 1) << max_ibs;
+		(void) refcount_add_many(&txh->txh_space_towrite,
+		    (end - start + 1) << max_ibs, FTAG);
 		if (start != 0) {
 			/*
 			 * We also need a new blkid=0 indirect block
 			 * to reference any existing file data.
 			 */
-			txh->txh_space_towrite += 1ULL << max_ibs;
+			(void) refcount_add_many(&txh->txh_space_towrite,
+			    1ULL << max_ibs, FTAG);
 		}
 	}
 
 out:
-	if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
+	if (refcount_count(&txh->txh_space_towrite) +
+	    refcount_count(&txh->txh_space_tooverwrite) >
 	    2 * DMU_MAX_ACCESS)
 		err = SET_ERROR(EFBIG);
 
@@ -388,12 +418,15 @@
 	if (dn && dn->dn_dbuf->db_blkptr &&
 	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 	    dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
-		txh->txh_space_tooverwrite += space;
-		txh->txh_space_tounref += space;
+		(void) refcount_add_many(&txh->txh_space_tooverwrite,
+		    space, FTAG);
+		(void) refcount_add_many(&txh->txh_space_tounref, space, FTAG);
 	} else {
-		txh->txh_space_towrite += space;
-		if (dn && dn->dn_dbuf->db_blkptr)
-			txh->txh_space_tounref += space;
+		(void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
+		if (dn && dn->dn_dbuf->db_blkptr) {
+			(void) refcount_add_many(&txh->txh_space_tounref,
+			    space, FTAG);
+		}
 	}
 }
 
@@ -449,12 +482,12 @@
 		blkid = off >> dn->dn_datablkshift;
 		nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 
-		if (blkid >= dn->dn_maxblkid) {
+		if (blkid > dn->dn_maxblkid) {
 			rw_exit(&dn->dn_struct_rwlock);
 			return;
 		}
 		if (blkid + nblks > dn->dn_maxblkid)
-			nblks = dn->dn_maxblkid - blkid;
+			nblks = dn->dn_maxblkid - blkid + 1;
 
 	}
 	l0span = nblks;    /* save for later use to calc level > 1 overhead */
@@ -508,13 +541,15 @@
 		blkoff = P2PHASE(blkid, epb);
 		tochk = MIN(epb - blkoff, nblks);
 
-		err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
+		err = dbuf_hold_impl(dn, 1, blkid >> epbs,
+		    FALSE, FALSE, FTAG, &dbuf);
 		if (err) {
 			txh->txh_tx->tx_err = err;
 			break;
 		}
 
-		txh->txh_memory_tohold += dbuf->db.db_size;
+		(void) refcount_add_many(&txh->txh_memory_tohold,
+		    dbuf->db.db_size, FTAG);
 
 		/*
 		 * We don't check memory_tohold against DMU_MAX_ACCESS because
@@ -567,8 +602,9 @@
 		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
 
 		while (level++ < maxlevel) {
-			txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
-			    << dn->dn_indblkshift;
+			(void) refcount_add_many(&txh->txh_memory_tohold,
+			    MAX(MIN(blkcnt, nl1blks), 1) << dn->dn_indblkshift,
+			    FTAG);
 			blkcnt = 1 + (blkcnt >> epbs);
 		}
 	}
@@ -575,21 +611,51 @@
 
 	/* account for new level 1 indirect blocks that might show up */
 	if (skipped > 0) {
-		txh->txh_fudge += skipped << dn->dn_indblkshift;
+		(void) refcount_add_many(&txh->txh_fudge,
+		    skipped << dn->dn_indblkshift, FTAG);
 		skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
-		txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
+		(void) refcount_add_many(&txh->txh_memory_tohold,
+		    skipped << dn->dn_indblkshift, FTAG);
 	}
-	txh->txh_space_tofree += space;
-	txh->txh_space_tounref += unref;
+	(void) refcount_add_many(&txh->txh_space_tofree, space, FTAG);
+	(void) refcount_add_many(&txh->txh_space_tounref, unref, FTAG);
 }
 
+/*
+ * This function marks the transaction as being a "net free".  The end
+ * result is that refquotas will be disabled for this transaction, and
+ * this transaction will be able to use half of the pool space overhead
+ * (see dsl_pool_adjustedsize()).  Therefore this function should only
+ * be called for transactions that we expect will not cause a net increase
+ * in the amount of space used (but it's OK if that is occasionally not true).
+ */
 void
+dmu_tx_mark_netfree(dmu_tx_t *tx)
+{
+	dmu_tx_hold_t *txh;
+
+	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	    DMU_NEW_OBJECT, THT_FREE, 0, 0);
+
+	/*
+	 * Pretend that this operation will free 1GB of space.  This
+	 * should be large enough to cancel out the largest write.
+	 * We don't want to use something like UINT64_MAX, because that would
+	 * cause overflows when doing math with these values (e.g. in
+	 * dmu_tx_try_assign()).
+	 */
+	(void) refcount_add_many(&txh->txh_space_tofree,
+	    1024 * 1024 * 1024, FTAG);
+	(void) refcount_add_many(&txh->txh_space_tounref,
+	    1024 * 1024 * 1024, FTAG);
+}
+
+void
 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 {
 	dmu_tx_hold_t *txh;
 	dnode_t *dn;
-	uint64_t start, end, i;
-	int err, shift;
+	int err;
 	zio_t *zio;
 
 	ASSERT(tx->tx_txg == 0);
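
On the caller side, dmu_tx_mark_netfree() slots into the normal hold/assign
sequence before the transaction is assigned.  A hedged sketch of a delete
path ("os" and "object" are placeholders; error handling is trimmed):

    dmu_tx_t *tx = dmu_tx_create(os);
    dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
    dmu_tx_mark_netfree(tx);        /* we expect this tx to free space */
    int err = dmu_tx_assign(tx, TXG_WAIT);
    if (err != 0) {
        dmu_tx_abort(tx);
        return (err);
    }
    /* ... perform the frees under the assigned txg ... */
    dmu_tx_commit(tx);
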
@@ -599,14 +665,6 @@
 	if (txh == NULL)
 		return;
 	dn = txh->txh_dnode;
-
-	/* first block */
-	if (off != 0)
-		dmu_tx_count_write(txh, off, 1);
-	/* last block */
-	if (len != DMU_OBJECT_END)
-		dmu_tx_count_write(txh, off+len, 1);
-
 	dmu_tx_count_dnode(txh);
 
 	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
@@ -614,24 +672,54 @@
 	if (len == DMU_OBJECT_END)
 		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 
+
 	/*
-	 * For i/o error checking, read the first and last level-0
-	 * blocks, and all the level-1 blocks.  The above count_write's
-	 * have already taken care of the level-0 blocks.
+	 * For i/o error checking, we read the first and last level-0
+	 * blocks if they are not aligned, and all the level-1 blocks.
+	 *
+	 * Note:  dbuf_free_range() assumes that we have not instantiated
+	 * any level-0 dbufs that will be completely freed.  Therefore we must
+	 * exercise care to not read or count the first and last blocks
+	 * if they are blocksize-aligned.
 	 */
+	if (dn->dn_datablkshift == 0) {
+		if (off != 0 || len < dn->dn_datablksz)
+			dmu_tx_count_write(txh, 0, dn->dn_datablksz);
+	} else {
+		/* first block will be modified if it is not aligned */
+		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
+			dmu_tx_count_write(txh, off, 1);
+		/* last block will be modified if it is not aligned */
+		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
+			dmu_tx_count_write(txh, off+len, 1);
+	}
+
+	/*
+	 * Check level-1 blocks.
+	 */
 	if (dn->dn_nlevels > 1) {
-		shift = dn->dn_datablkshift + dn->dn_indblkshift -
+		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 		    SPA_BLKPTRSHIFT;
-		start = off >> shift;
-		end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
+		uint64_t start = off >> shift;
+		uint64_t end = (off + len) >> shift;
 
+		ASSERT(dn->dn_indblkshift != 0);
+
+		/*
+		 * dnode_reallocate() can result in an object with indirect
+		 * blocks having an odd data block size.  In this case,
+		 * just check the single block.
+		 */
+		if (dn->dn_datablkshift == 0)
+			start = end = 0;
+
 		zio = zio_root(tx->tx_pool->dp_spa,
 		    NULL, NULL, ZIO_FLAG_CANFAIL);
-		for (i = start; i <= end; i++) {
+		for (uint64_t i = start; i <= end; i++) {
 			uint64_t ibyte = i << shift;
 			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 			i = ibyte >> shift;
-			if (err == ESRCH)
+			if (err == ESRCH || i > end)
 				break;
 			if (err) {
 				tx->tx_err = err;
@@ -659,8 +747,7 @@
 {
 	dmu_tx_hold_t *txh;
 	dnode_t *dn;
-	uint64_t nblocks;
-	int epbs, err;
+	int err;
 
 	ASSERT(tx->tx_txg == 0);
 
@@ -703,12 +790,17 @@
 		 */
 		bp = &dn->dn_phys->dn_blkptr[0];
 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
-		    bp, bp->blk_birth))
-			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
-		else
-			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
-		if (!BP_IS_HOLE(bp))
-			txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+		    bp, bp->blk_birth)) {
+			(void) refcount_add_many(&txh->txh_space_tooverwrite,
+			    MZAP_MAX_BLKSZ, FTAG);
+		} else {
+			(void) refcount_add_many(&txh->txh_space_towrite,
+			    MZAP_MAX_BLKSZ, FTAG);
+		}
+		if (!BP_IS_HOLE(bp)) {
+			(void) refcount_add_many(&txh->txh_space_tounref,
+			    MZAP_MAX_BLKSZ, FTAG);
+		}
 		return;
 	}
 
@@ -717,8 +809,7 @@
 		 * access the name in this fat-zap so that we'll check
 		 * for i/o errors to the leaf blocks, etc.
 		 */
-		err = zap_lookup(dn->dn_objset, dn->dn_object, name,
-		    8, 0, NULL);
+		err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
 		if (err == EIO) {
 			tx->tx_err = err;
 			return;
@@ -725,19 +816,34 @@
 		}
 	}
 
-	err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
+	err = zap_count_write_by_dnode(dn, name, add,
 	    &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 
 	/*
 	 * If the modified blocks are scattered to the four winds,
-	 * we'll have to modify an indirect twig for each.
+	 * we'll have to modify an indirect twig for each.  We can make
+	 * modifications at up to 3 locations:
+	 *  - header block at the beginning of the object
+	 *  - target leaf block
+	 *  - end of the object, where we might need to write:
+	 *	- a new leaf block if the target block needs to be split
+	 *	- the new pointer table, if it is growing
+	 *	- the new cookie table, if it is growing
 	 */
-	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-	for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
-		if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
-			txh->txh_space_towrite += 3 << dn->dn_indblkshift;
-		else
-			txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
+	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	dsl_dataset_phys_t *ds_phys =
+	    dsl_dataset_phys(dn->dn_objset->os_dsl_dataset);
+	for (int lvl = 1; lvl < dn->dn_nlevels; lvl++) {
+		uint64_t num_indirects = 1 + (dn->dn_maxblkid >> (epbs * lvl));
+		uint64_t spc = MIN(3, num_indirects) << dn->dn_indblkshift;
+		if (ds_phys->ds_prev_snap_obj != 0) {
+			(void) refcount_add_many(&txh->txh_space_towrite,
+			    spc, FTAG);
+		} else {
+			(void) refcount_add_many(&txh->txh_space_tooverwrite,
+			    spc, FTAG);
+		}
+	}
 }
 
 void
@@ -762,7 +868,7 @@
 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 	    DMU_NEW_OBJECT, THT_SPACE, space, 0);
 
-	txh->txh_space_towrite += space;
+	(void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
 }
 
 int
@@ -898,8 +1004,163 @@
 }
 #endif
 
+/*
+ * If we can't do 10 iops, something is wrong.  Let us go ahead
+ * and hit zfs_dirty_data_max.
+ */
+hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
+int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
+
+/*
+ * We delay transactions when we've determined that the backend storage
+ * isn't able to accommodate the rate of incoming writes.
+ *
+ * If there is already a transaction waiting, we delay relative to when
+ * that transaction finishes waiting.  This way the calculated min_time
+ * is independent of the number of threads concurrently executing
+ * transactions.
+ *
+ * If we are the only waiter, wait relative to when the transaction
+ * started, rather than the current time.  This credits the transaction for
+ * "time already served", e.g. reading indirect blocks.
+ *
+ * The minimum time for a transaction to take is calculated as:
+ *     min_time = scale * (dirty - min) / (max - dirty)
+ *     min_time is then capped at zfs_delay_max_ns.
+ *
+ * The delay has two degrees of freedom that can be adjusted via tunables.
+ * The percentage of dirty data at which we start to delay is defined by
+ * zfs_delay_min_dirty_percent. This should typically be at or above
+ * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
+ * delay after writing at full speed has failed to keep up with the incoming
+ * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
+ * speaking, this variable determines the amount of delay at the midpoint of
+ * the curve.
+ *
+ * delay
+ *  10ms +-------------------------------------------------------------*+
+ *       |                                                             *|
+ *   9ms +                                                             *+
+ *       |                                                             *|
+ *   8ms +                                                             *+
+ *       |                                                            * |
+ *   7ms +                                                            * +
+ *       |                                                            * |
+ *   6ms +                                                            * +
+ *       |                                                            * |
+ *   5ms +                                                           *  +
+ *       |                                                           *  |
+ *   4ms +                                                           *  +
+ *       |                                                           *  |
+ *   3ms +                                                          *   +
+ *       |                                                          *   |
+ *   2ms +                                              (midpoint) *    +
+ *       |                                                  |    **     |
+ *   1ms +                                                  v ***       +
+ *       |             zfs_delay_scale ---------->     ********         |
+ *     0 +-------------------------------------*********----------------+
+ *       0%                    <- zfs_dirty_data_max ->               100%
+ *
+ * Note that since the delay is added to the outstanding time remaining on the
+ * most recent transaction, the delay is effectively the inverse of IOPS.
+ * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
+ * was chosen such that small changes in the amount of accumulated dirty data
+ * in the first 3/4 of the curve yield relatively small differences in the
+ * amount of delay.
+ *
+ * The effects can be easier to understand when the amount of delay is
+ * represented on a log scale:
+ *
+ * delay
+ * 100ms +-------------------------------------------------------------++
+ *       +                                                              +
+ *       |                                                              |
+ *       +                                                             *+
+ *  10ms +                                                             *+
+ *       +                                                           ** +
+ *       |                                              (midpoint)  **  |
+ *       +                                                  |     **    +
+ *   1ms +                                                  v ****      +
+ *       +             zfs_delay_scale ---------->        *****         +
+ *       |                                             ****             |
+ *       +                                          ****                +
+ * 100us +                                        **                    +
+ *       +                                       *                      +
+ *       |                                      *                       |
+ *       +                                     *                        +
+ *  10us +                                     *                        +
+ *       +                                                              +
+ *       |                                                              |
+ *       +                                                              +
+ *       +--------------------------------------------------------------+
+ *       0%                    <- zfs_dirty_data_max ->               100%
+ *
+ * Note here that only as the amount of dirty data approaches its limit does
+ * the delay start to increase rapidly. The goal of a properly tuned system
+ * should be to keep the amount of dirty data out of that range by first
+ * ensuring that the appropriate limits are set for the I/O scheduler to reach
+ * optimal throughput on the backend storage, and then by changing the value
+ * of zfs_delay_scale to increase the steepness of the curve.
+ */
+static void
+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+{
+	dsl_pool_t *dp = tx->tx_pool;
+	uint64_t delay_min_bytes =
+	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	hrtime_t wakeup, min_tx_time, now;
+
+	if (dirty <= delay_min_bytes)
+		return;
+
+	/*
+	 * The caller has already waited until we are under the max.
+	 * We make them pass us the amount of dirty data so we don't
+	 * have to handle the case of it being >= the max, which could
+	 * cause a divide-by-zero if it's == the max.
+	 */
+	ASSERT3U(dirty, <, zfs_dirty_data_max);
+
+	now = gethrtime();
+	min_tx_time = zfs_delay_scale *
+	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+	if (now > tx->tx_start + min_tx_time)
+		return;
+
+	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
+
+	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
+	    uint64_t, min_tx_time);
+
+	mutex_enter(&dp->dp_lock);
+	wakeup = MAX(tx->tx_start + min_tx_time,
+	    dp->dp_last_wakeup + min_tx_time);
+	dp->dp_last_wakeup = wakeup;
+	mutex_exit(&dp->dp_lock);
+
+#ifdef _KERNEL
+#ifdef illumos
+	mutex_enter(&curthread->t_delay_lock);
+	while (cv_timedwait_hires(&curthread->t_delay_cv,
+	    &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
+	    CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
+		continue;
+	mutex_exit(&curthread->t_delay_lock);
+#else
+	pause_sbt("dmu_tx_delay", wakeup * SBT_1NS,
+	    zfs_delay_resolution_ns * SBT_1NS, C_ABSOLUTE);
+#endif
+#else
+	hrtime_t delta = wakeup - gethrtime();
+	struct timespec ts;
+	ts.tv_sec = delta / NANOSEC;
+	ts.tv_nsec = delta % NANOSEC;
+	(void) nanosleep(&ts, NULL);
+#endif
+}
+
 static int
-dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
+dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
 {
 	dmu_tx_hold_t *txh;
 	spa_t *spa = tx->tx_pool->dp_spa;
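
Plugging numbers into the min_time formula makes the curve above concrete.
A userland sketch with assumed defaults (zfs_delay_scale = 500000 ns,
zfs_delay_min_dirty_percent = 60, and a 4GB zfs_dirty_data_max):

    uint64_t dirty_max = 4ULL << 30;            /* assumed 4GB cap */
    uint64_t delay_scale = 500000;              /* assumed default, in ns */
    uint64_t delay_min_bytes = dirty_max * 60 / 100;
    uint64_t dirty = dirty_max * 80 / 100;      /* 80% of the limit dirty */

    /* min_time = scale * (dirty - min) / (max - dirty) */
    uint64_t min_tx_time = delay_scale *
        (dirty - delay_min_bytes) / (dirty_max - dirty);
    /* 500000 * (20% of max) / (20% of max) = 500000 ns = 500 us */

At 80% dirty the numerator and denominator cancel, so the delay is exactly
zfs_delay_scale: the 500us midpoint (2000 IOPS) called out in the comment.
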
@@ -922,12 +1183,18 @@
 		 * of the failuremode setting.
 		 */
 		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
-		    txg_how != TXG_WAIT)
+		    !(txg_how & TXG_WAIT))
 			return (SET_ERROR(EIO));
 
 		return (SET_ERROR(ERESTART));
 	}
 
+	if (!tx->tx_dirty_delayed &&
+	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
+		tx->tx_wait_dirty = B_TRUE;
+		return (SET_ERROR(ERESTART));
+	}
+
 	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 	tx->tx_needassign_txh = NULL;
 
@@ -954,12 +1221,12 @@
 			(void) refcount_add(&dn->dn_tx_holds, tx);
 			mutex_exit(&dn->dn_mtx);
 		}
-		towrite += txh->txh_space_towrite;
-		tofree += txh->txh_space_tofree;
-		tooverwrite += txh->txh_space_tooverwrite;
-		tounref += txh->txh_space_tounref;
-		tohold += txh->txh_memory_tohold;
-		fudge += txh->txh_fudge;
+		towrite += refcount_count(&txh->txh_space_towrite);
+		tofree += refcount_count(&txh->txh_space_tofree);
+		tooverwrite += refcount_count(&txh->txh_space_tooverwrite);
+		tounref += refcount_count(&txh->txh_space_tounref);
+		tohold += refcount_count(&txh->txh_memory_tohold);
+		fudge += refcount_count(&txh->txh_fudge);
 	}
 
 	/*
@@ -1042,33 +1309,44 @@
 }
 
 /*
- * Assign tx to a transaction group.  txg_how can be one of:
+ * Assign tx to a transaction group; txg_how is a bitmask:
  *
- * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
- *	a new one.  This should be used when you're not holding locks.
- *	It will only fail if we're truly out of space (or over quota).
+ * If TXG_WAIT is set and the currently open txg is full, this function
+ * will wait until there's a new txg. This should be used when no locks
+ * are being held. With this bit set, this function will only fail if
+ * we're truly out of space (or over quota).
  *
- * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
- *	blocking, returns immediately with ERESTART.  This should be used
- *	whenever you're holding locks.  On an ERESTART error, the caller
- *	should drop locks, do a dmu_tx_wait(tx), and try again.
+ * If TXG_WAIT is *not* set and we can't assign into the currently open
+ * txg without blocking, this function will return immediately with
+ * ERESTART. This should be used whenever locks are being held.  On an
+ * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
+ * and try again.
+ *
+ * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
+ * delayed due to the ZFS Write Throttle (see comments in dsl_pool.c for
+ * details on the throttle). This is used by the VFS operations, after
+ * they have already called dmu_tx_wait() (though most likely on a
+ * different tx).
  */
 int
-dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
+dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
 {
 	int err;
 
 	ASSERT(tx->tx_txg == 0);
-	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT);
+	ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
 	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
 
 	/* If we might wait, we must not hold the config lock. */
-	ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
+	IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
 
+	if ((txg_how & TXG_NOTHROTTLE))
+		tx->tx_dirty_delayed = B_TRUE;
+
 	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
 		dmu_tx_unassign(tx);
 
-		if (err != ERESTART || txg_how != TXG_WAIT)
+		if (err != ERESTART || !(txg_how & TXG_WAIT))
 			return (err);
 
 		dmu_tx_wait(tx);
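
The long-standing consumer pattern for the non-blocking mode carries over to
the bitmask interface unchanged.  A sketch ("os", "object", "off", and "len"
are placeholders):

    for (;;) {
        dmu_tx_t *tx = dmu_tx_create(os);
        dmu_tx_hold_write(tx, object, off, len);
        int err = dmu_tx_assign(tx, TXG_NOWAIT);
        if (err == 0) {
            /* ... modify data under the assigned txg ... */
            dmu_tx_commit(tx);
            break;
        }
        if (err != ERESTART) {
            dmu_tx_abort(tx);
            return (err);       /* hard failure, e.g. ENOSPC */
        }
        dmu_tx_wait(tx);        /* real callers drop their locks first */
        dmu_tx_abort(tx);       /* tx is gone; retry with a fresh one */
    }

Note the ordering: dmu_tx_wait() must run before dmu_tx_abort(), because the
wait consults state recorded in the failed tx.  After such a wait, a caller
may add TXG_NOTHROTTLE on the retry so it is not delayed a second time.
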
@@ -1083,18 +1361,48 @@
 dmu_tx_wait(dmu_tx_t *tx)
 {
 	spa_t *spa = tx->tx_pool->dp_spa;
+	dsl_pool_t *dp = tx->tx_pool;
 
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(!dsl_pool_config_held(tx->tx_pool));
 
-	/*
-	 * It's possible that the pool has become active after this thread
-	 * has tried to obtain a tx. If that's the case then his
-	 * tx_lasttried_txg would not have been assigned.
-	 */
-	if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
-		txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
+	if (tx->tx_wait_dirty) {
+		/*
+		 * dmu_tx_try_assign() has determined that we need to wait
+		 * because we've consumed much or all of the dirty buffer
+		 * space.
+		 */
+		mutex_enter(&dp->dp_lock);
+		while (dp->dp_dirty_total >= zfs_dirty_data_max)
+			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
+		uint64_t dirty = dp->dp_dirty_total;
+		mutex_exit(&dp->dp_lock);
+
+		dmu_tx_delay(tx, dirty);
+
+		tx->tx_wait_dirty = B_FALSE;
+
+		/*
+		 * Note: setting tx_dirty_delayed only has effect if the
+		 * caller used TXG_WAIT.  Otherwise they are going to
+		 * destroy this tx and try again.  The common case,
+		 * zfs_write(), uses TXG_WAIT.
+		 */
+		tx->tx_dirty_delayed = B_TRUE;
+	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
+		/*
+		 * If the pool is suspended we need to wait until it
+		 * is resumed.  Note that it's possible that the pool
+		 * has become active after this thread has tried to
+		 * obtain a tx.  If that's the case then tx_lasttried_txg
+		 * would not have been set.
+		 */
+		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
 	} else if (tx->tx_needassign_txh) {
+		/*
+		 * A dnode is assigned to the quiescing txg.  Wait for its
+		 * transaction to complete.
+		 */
 		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
 
 		mutex_enter(&dn->dn_mtx);
@@ -1124,11 +1432,46 @@
 #endif
 }
 
+static void
+dmu_tx_destroy(dmu_tx_t *tx)
+{
+	dmu_tx_hold_t *txh;
+
+	while ((txh = list_head(&tx->tx_holds)) != NULL) {
+		dnode_t *dn = txh->txh_dnode;
+
+		list_remove(&tx->tx_holds, txh);
+		refcount_destroy_many(&txh->txh_space_towrite,
+		    refcount_count(&txh->txh_space_towrite));
+		refcount_destroy_many(&txh->txh_space_tofree,
+		    refcount_count(&txh->txh_space_tofree));
+		refcount_destroy_many(&txh->txh_space_tooverwrite,
+		    refcount_count(&txh->txh_space_tooverwrite));
+		refcount_destroy_many(&txh->txh_space_tounref,
+		    refcount_count(&txh->txh_space_tounref));
+		refcount_destroy_many(&txh->txh_memory_tohold,
+		    refcount_count(&txh->txh_memory_tohold));
+		refcount_destroy_many(&txh->txh_fudge,
+		    refcount_count(&txh->txh_fudge));
+		kmem_free(txh, sizeof (dmu_tx_hold_t));
+		if (dn != NULL)
+			dnode_rele(dn, tx);
+	}
+
+	list_destroy(&tx->tx_callbacks);
+	list_destroy(&tx->tx_holds);
+#ifdef ZFS_DEBUG
+	refcount_destroy_many(&tx->tx_space_written,
+	    refcount_count(&tx->tx_space_written));
+	refcount_destroy_many(&tx->tx_space_freed,
+	    refcount_count(&tx->tx_space_freed));
+#endif
+	kmem_free(tx, sizeof (dmu_tx_t));
+}
+
 void
 dmu_tx_commit(dmu_tx_t *tx)
 {
-	dmu_tx_hold_t *txh;
-
 	ASSERT(tx->tx_txg != 0);
 
 	/*
@@ -1135,13 +1478,13 @@
 	 * Go through the transaction's hold list and remove holds on
 	 * associated dnodes, notifying waiters if no holds remain.
 	 */
-	while (txh = list_head(&tx->tx_holds)) {
+	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
+	    txh = list_next(&tx->tx_holds, txh)) {
 		dnode_t *dn = txh->txh_dnode;
 
-		list_remove(&tx->tx_holds, txh);
-		kmem_free(txh, sizeof (dmu_tx_hold_t));
 		if (dn == NULL)
 			continue;
+
 		mutex_enter(&dn->dn_mtx);
 		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 
@@ -1150,7 +1493,6 @@
 			cv_broadcast(&dn->dn_notxholds);
 		}
 		mutex_exit(&dn->dn_mtx);
-		dnode_rele(dn, tx);
 	}
 
 	if (tx->tx_tempreserve_cookie)
@@ -1162,36 +1504,19 @@
 	if (tx->tx_anyobj == FALSE)
 		txg_rele_to_sync(&tx->tx_txgh);
 
-	list_destroy(&tx->tx_callbacks);
-	list_destroy(&tx->tx_holds);
 #ifdef ZFS_DEBUG
 	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
 	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
 	    tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
-	refcount_destroy_many(&tx->tx_space_written,
-	    refcount_count(&tx->tx_space_written));
-	refcount_destroy_many(&tx->tx_space_freed,
-	    refcount_count(&tx->tx_space_freed));
 #endif
-	kmem_free(tx, sizeof (dmu_tx_t));
+	dmu_tx_destroy(tx);
 }
 
 void
 dmu_tx_abort(dmu_tx_t *tx)
 {
-	dmu_tx_hold_t *txh;
-
 	ASSERT(tx->tx_txg == 0);
 
-	while (txh = list_head(&tx->tx_holds)) {
-		dnode_t *dn = txh->txh_dnode;
-
-		list_remove(&tx->tx_holds, txh);
-		kmem_free(txh, sizeof (dmu_tx_hold_t));
-		if (dn != NULL)
-			dnode_rele(dn, tx);
-	}
-
 	/*
 	 * Call any registered callbacks with an error code.
 	 */
@@ -1198,15 +1523,7 @@
 	if (!list_is_empty(&tx->tx_callbacks))
 		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
 
-	list_destroy(&tx->tx_callbacks);
-	list_destroy(&tx->tx_holds);
-#ifdef ZFS_DEBUG
-	refcount_destroy_many(&tx->tx_space_written,
-	    refcount_count(&tx->tx_space_written));
-	refcount_destroy_many(&tx->tx_space_freed,
-	    refcount_count(&tx->tx_space_freed));
-#endif
-	kmem_free(tx, sizeof (dmu_tx_t));
+	dmu_tx_destroy(tx);
 }
 
 uint64_t
@@ -1245,7 +1562,7 @@
 {
 	dmu_tx_callback_t *dcb;
 
-	while (dcb = list_head(cb_list)) {
+	while ((dcb = list_head(cb_list)) != NULL) {
 		list_remove(cb_list, dcb);
 		dcb->dcb_func(dcb->dcb_data, error);
 		kmem_free(dcb, sizeof (dmu_tx_callback_t));
@@ -1303,18 +1620,24 @@
 
 	/* If blkptr doesn't exist then add space to towrite */
 	if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
-		txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+		(void) refcount_add_many(&txh->txh_space_towrite,
+		    SPA_OLD_MAXBLOCKSIZE, FTAG);
 	} else {
 		blkptr_t *bp;
 
 		bp = &dn->dn_phys->dn_spill;
 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
-		    bp, bp->blk_birth))
-			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
-		else
-			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
-		if (!BP_IS_HOLE(bp))
-			txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+		    bp, bp->blk_birth)) {
+			(void) refcount_add_many(&txh->txh_space_tooverwrite,
+			    SPA_OLD_MAXBLOCKSIZE, FTAG);
+		} else {
+			(void) refcount_add_many(&txh->txh_space_towrite,
+			    SPA_OLD_MAXBLOCKSIZE, FTAG);
+		}
+		if (!BP_IS_HOLE(bp)) {
+			(void) refcount_add_many(&txh->txh_space_tounref,
+			    SPA_OLD_MAXBLOCKSIZE, FTAG);
+		}
 	}
 }
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -24,6 +24,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ */
+
 #include <sys/zfs_context.h>
 #include <sys/dnode.h>
 #include <sys/dmu_objset.h>
@@ -33,19 +37,22 @@
 #include <sys/kstat.h>
 
 /*
- * I'm against tune-ables, but these should probably exist as tweakable globals
- * until we can get this working the way we want it to.
+ * This tunable disables predictive prefetch.  Note that it leaves "prescient"
+ * prefetch (e.g. prefetch for zfs send) intact.  Unlike predictive prefetch,
+ * prescient prefetch never issues i/os that end up not being needed,
+ * so it can't hurt performance.
  */
+boolean_t zfs_prefetch_disable = B_FALSE;
 
-int zfs_prefetch_disable = 0;
-
 /* max # of streams per zfetch */
 uint32_t	zfetch_max_streams = 8;
 /* min time before stream reclaim */
 uint32_t	zfetch_min_sec_reap = 2;
-/* max number of blocks to fetch at a time */
-uint32_t	zfetch_block_cap = 256;
-/* number of bytes in a array_read at which we stop prefetching (1Mb) */
+/* max bytes to prefetch per stream (default 8MB) */
+uint32_t	zfetch_max_distance = 8 * 1024 * 1024;
+/* max bytes to prefetch indirects for per stream (default 64MB) */
+uint32_t	zfetch_max_idistance = 64 * 1024 * 1024;
+/* max number of bytes in an array_read in which we allow prefetching (1MB) */
 uint64_t	zfetch_array_rd_sz = 1024 * 1024;
 
 SYSCTL_DECL(_vfs_zfs);
@@ -56,202 +63,36 @@
 SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RW,
     &zfetch_max_streams, 0, "Max # of streams per zfetch");
 TUNABLE_INT("vfs.zfs.zfetch.min_sec_reap", &zfetch_min_sec_reap);
-SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RDTUN,
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RWTUN,
     &zfetch_min_sec_reap, 0, "Min time before stream reclaim");
-TUNABLE_INT("vfs.zfs.zfetch.block_cap", &zfetch_block_cap);
-SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, block_cap, CTLFLAG_RDTUN,
-    &zfetch_block_cap, 0, "Max number of blocks to fetch at a time");
+TUNABLE_INT("vfs.zfs.zfetch.max_distance", &zfetch_max_distance);
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN,
+    &zfetch_max_distance, 0, "Max bytes to prefetch per stream");
 TUNABLE_QUAD("vfs.zfs.zfetch.array_rd_sz", &zfetch_array_rd_sz);
-SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RDTUN,
+SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RWTUN,
     &zfetch_array_rd_sz, 0,
     "Number of bytes in a array_read at which we stop prefetching");
 
-/* forward decls for static routines */
-static boolean_t	dmu_zfetch_colinear(zfetch_t *, zstream_t *);
-static void		dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
-static uint64_t		dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
-static uint64_t		dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
-static boolean_t	dmu_zfetch_find(zfetch_t *, zstream_t *, int);
-static int		dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
-static zstream_t	*dmu_zfetch_stream_reclaim(zfetch_t *);
-static void		dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
-static int		dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
-
 typedef struct zfetch_stats {
 	kstat_named_t zfetchstat_hits;
 	kstat_named_t zfetchstat_misses;
-	kstat_named_t zfetchstat_colinear_hits;
-	kstat_named_t zfetchstat_colinear_misses;
-	kstat_named_t zfetchstat_stride_hits;
-	kstat_named_t zfetchstat_stride_misses;
-	kstat_named_t zfetchstat_reclaim_successes;
-	kstat_named_t zfetchstat_reclaim_failures;
-	kstat_named_t zfetchstat_stream_resets;
-	kstat_named_t zfetchstat_stream_noresets;
-	kstat_named_t zfetchstat_bogus_streams;
+	kstat_named_t zfetchstat_max_streams;
 } zfetch_stats_t;
 
 static zfetch_stats_t zfetch_stats = {
 	{ "hits",			KSTAT_DATA_UINT64 },
 	{ "misses",			KSTAT_DATA_UINT64 },
-	{ "colinear_hits",		KSTAT_DATA_UINT64 },
-	{ "colinear_misses",		KSTAT_DATA_UINT64 },
-	{ "stride_hits",		KSTAT_DATA_UINT64 },
-	{ "stride_misses",		KSTAT_DATA_UINT64 },
-	{ "reclaim_successes",		KSTAT_DATA_UINT64 },
-	{ "reclaim_failures",		KSTAT_DATA_UINT64 },
-	{ "streams_resets",		KSTAT_DATA_UINT64 },
-	{ "streams_noresets",		KSTAT_DATA_UINT64 },
-	{ "bogus_streams",		KSTAT_DATA_UINT64 },
+	{ "max_streams",		KSTAT_DATA_UINT64 },
 };
 
-#define	ZFETCHSTAT_INCR(stat, val) \
-	atomic_add_64(&zfetch_stats.stat.value.ui64, (val));
+#define	ZFETCHSTAT_BUMP(stat) \
+	atomic_inc_64(&zfetch_stats.stat.value.ui64);
 
-#define	ZFETCHSTAT_BUMP(stat)		ZFETCHSTAT_INCR(stat, 1);
-
 kstat_t		*zfetch_ksp;
 
-/*
- * Given a zfetch structure and a zstream structure, determine whether the
- * blocks to be read are part of a co-linear pair of existing prefetch
- * streams.  If a set is found, coalesce the streams, removing one, and
- * configure the prefetch so it looks for a strided access pattern.
- *
- * In other words: if we find two sequential access streams that are
- * the same length and distance N appart, and this read is N from the
- * last stream, then we are probably in a strided access pattern.  So
- * combine the two sequential streams into a single strided stream.
- *
- * Returns whether co-linear streams were found.
- */
-static boolean_t
-dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
-{
-	zstream_t	*z_walk;
-	zstream_t	*z_comp;
-
-	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
-		return (0);
-
-	if (zh == NULL) {
-		rw_exit(&zf->zf_rwlock);
-		return (0);
-	}
-
-	for (z_walk = list_head(&zf->zf_stream); z_walk;
-	    z_walk = list_next(&zf->zf_stream, z_walk)) {
-		for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
-		    z_comp = list_next(&zf->zf_stream, z_comp)) {
-			int64_t		diff;
-
-			if (z_walk->zst_len != z_walk->zst_stride ||
-			    z_comp->zst_len != z_comp->zst_stride) {
-				continue;
-			}
-
-			diff = z_comp->zst_offset - z_walk->zst_offset;
-			if (z_comp->zst_offset + diff == zh->zst_offset) {
-				z_walk->zst_offset = zh->zst_offset;
-				z_walk->zst_direction = diff < 0 ? -1 : 1;
-				z_walk->zst_stride =
-				    diff * z_walk->zst_direction;
-				z_walk->zst_ph_offset =
-				    zh->zst_offset + z_walk->zst_stride;
-				dmu_zfetch_stream_remove(zf, z_comp);
-				mutex_destroy(&z_comp->zst_lock);
-				kmem_free(z_comp, sizeof (zstream_t));
-
-				dmu_zfetch_dofetch(zf, z_walk);
-
-				rw_exit(&zf->zf_rwlock);
-				return (1);
-			}
-
-			diff = z_walk->zst_offset - z_comp->zst_offset;
-			if (z_walk->zst_offset + diff == zh->zst_offset) {
-				z_walk->zst_offset = zh->zst_offset;
-				z_walk->zst_direction = diff < 0 ? -1 : 1;
-				z_walk->zst_stride =
-				    diff * z_walk->zst_direction;
-				z_walk->zst_ph_offset =
-				    zh->zst_offset + z_walk->zst_stride;
-				dmu_zfetch_stream_remove(zf, z_comp);
-				mutex_destroy(&z_comp->zst_lock);
-				kmem_free(z_comp, sizeof (zstream_t));
-
-				dmu_zfetch_dofetch(zf, z_walk);
-
-				rw_exit(&zf->zf_rwlock);
-				return (1);
-			}
-		}
-	}
-
-	rw_exit(&zf->zf_rwlock);
-	return (0);
-}
-
-/*
- * Given a zstream_t, determine the bounds of the prefetch.  Then call the
- * routine that actually prefetches the individual blocks.
- */
-static void
-dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
-{
-	uint64_t	prefetch_tail;
-	uint64_t	prefetch_limit;
-	uint64_t	prefetch_ofst;
-	uint64_t	prefetch_len;
-	uint64_t	blocks_fetched;
-
-	zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
-	zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
-
-	prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
-	    (int64_t)(zs->zst_offset + zs->zst_stride));
-	/*
-	 * XXX: use a faster division method?
-	 */
-	prefetch_limit = zs->zst_offset + zs->zst_len +
-	    (zs->zst_cap * zs->zst_stride) / zs->zst_len;
-
-	while (prefetch_tail < prefetch_limit) {
-		prefetch_ofst = zs->zst_offset + zs->zst_direction *
-		    (prefetch_tail - zs->zst_offset);
-
-		prefetch_len = zs->zst_len;
-
-		/*
-		 * Don't prefetch beyond the end of the file, if working
-		 * backwards.
-		 */
-		if ((zs->zst_direction == ZFETCH_BACKWARD) &&
-		    (prefetch_ofst > prefetch_tail)) {
-			prefetch_len += prefetch_ofst;
-			prefetch_ofst = 0;
-		}
-
-		/* don't prefetch more than we're supposed to */
-		if (prefetch_len > zs->zst_len)
-			break;
-
-		blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
-		    prefetch_ofst, zs->zst_len);
-
-		prefetch_tail += zs->zst_stride;
-		/* stop if we've run out of stuff to prefetch */
-		if (blocks_fetched < zs->zst_len)
-			break;
-	}
-	zs->zst_ph_offset = prefetch_tail;
-	zs->zst_last = ddi_get_lbolt();
-}
-
 void
 zfetch_init(void)
 {
-
 	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
 	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
@@ -279,284 +120,41 @@
 void
 dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
 {
-	if (zf == NULL) {
+	if (zf == NULL)
 		return;
-	}
 
 	zf->zf_dnode = dno;
-	zf->zf_stream_cnt = 0;
-	zf->zf_alloc_fail = 0;
 
 	list_create(&zf->zf_stream, sizeof (zstream_t),
-	    offsetof(zstream_t, zst_node));
+	    offsetof(zstream_t, zs_node));
 
 	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
 }
 
-/*
- * This function computes the actual size, in blocks, that can be prefetched,
- * and fetches it.
- */
-static uint64_t
-dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+static void
+dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
 {
-	uint64_t	fetchsz;
-	uint64_t	i;
-
-	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
-
-	for (i = 0; i < fetchsz; i++) {
-		dbuf_prefetch(dn, blkid + i);
-	}
-
-	return (fetchsz);
+	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+	list_remove(&zf->zf_stream, zs);
+	mutex_destroy(&zs->zs_lock);
+	kmem_free(zs, sizeof (*zs));
 }
 
 /*
- * this function returns the number of blocks that would be prefetched, based
- * upon the supplied dnode, blockid, and nblks.  This is used so that we can
- * update streams in place, and then prefetch with their old value after the
- * fact.  This way, we can delay the prefetch, but subsequent accesses to the
- * stream won't result in the same data being prefetched multiple times.
+ * Clean up state associated with a zfetch structure (e.g. destroy the
+ * streams).  This doesn't free the zfetch_t itself; that's left to the caller.
  */
-static uint64_t
-dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
-{
-	uint64_t	fetchsz;
-
-	if (blkid > dn->dn_maxblkid) {
-		return (0);
-	}
-
-	/* compute fetch size */
-	if (blkid + nblks + 1 > dn->dn_maxblkid) {
-		fetchsz = (dn->dn_maxblkid - blkid) + 1;
-		ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
-	} else {
-		fetchsz = nblks;
-	}
-
-
-	return (fetchsz);
-}
-
-/*
- * given a zfetch and a zstream structure, see if there is an associated zstream
- * for this block read.  If so, it starts a prefetch for the stream it
- * located and returns true, otherwise it returns false
- */
-static boolean_t
-dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
-{
-	zstream_t	*zs;
-	int64_t		diff;
-	int		reset = !prefetched;
-	int		rc = 0;
-
-	if (zh == NULL)
-		return (0);
-
-	/*
-	 * XXX: This locking strategy is a bit coarse; however, it's impact has
-	 * yet to be tested.  If this turns out to be an issue, it can be
-	 * modified in a number of different ways.
-	 */
-
-	rw_enter(&zf->zf_rwlock, RW_READER);
-top:
-
-	for (zs = list_head(&zf->zf_stream); zs;
-	    zs = list_next(&zf->zf_stream, zs)) {
-
-		/*
-		 * XXX - should this be an assert?
-		 */
-		if (zs->zst_len == 0) {
-			/* bogus stream */
-			ZFETCHSTAT_BUMP(zfetchstat_bogus_streams);
-			continue;
-		}
-
-		/*
-		 * We hit this case when we are in a strided prefetch stream:
-		 * we will read "len" blocks before "striding".
-		 */
-		if (zh->zst_offset >= zs->zst_offset &&
-		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
-			if (prefetched) {
-				/* already fetched */
-				ZFETCHSTAT_BUMP(zfetchstat_stride_hits);
-				rc = 1;
-				goto out;
-			} else {
-				ZFETCHSTAT_BUMP(zfetchstat_stride_misses);
-			}
-		}
-
-		/*
-		 * This is the forward sequential read case: we increment
-		 * len by one each time we hit here, so we will enter this
-		 * case on every read.
-		 */
-		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
-
-			reset = !prefetched && zs->zst_len > 1;
-
-			if (mutex_tryenter(&zs->zst_lock) == 0) {
-				rc = 1;
-				goto out;
-			}
-
-			if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
-				mutex_exit(&zs->zst_lock);
-				goto top;
-			}
-			zs->zst_len += zh->zst_len;
-			diff = zs->zst_len - zfetch_block_cap;
-			if (diff > 0) {
-				zs->zst_offset += diff;
-				zs->zst_len = zs->zst_len > diff ?
-				    zs->zst_len - diff : 0;
-			}
-			zs->zst_direction = ZFETCH_FORWARD;
-
-			break;
-
-		/*
-		 * Same as above, but reading backwards through the file.
-		 */
-		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
-			/* backwards sequential access */
-
-			reset = !prefetched && zs->zst_len > 1;
-
-			if (mutex_tryenter(&zs->zst_lock) == 0) {
-				rc = 1;
-				goto out;
-			}
-
-			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
-				mutex_exit(&zs->zst_lock);
-				goto top;
-			}
-
-			zs->zst_offset = zs->zst_offset > zh->zst_len ?
-			    zs->zst_offset - zh->zst_len : 0;
-			zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
-			    zs->zst_ph_offset - zh->zst_len : 0;
-			zs->zst_len += zh->zst_len;
-
-			diff = zs->zst_len - zfetch_block_cap;
-			if (diff > 0) {
-				zs->zst_ph_offset = zs->zst_ph_offset > diff ?
-				    zs->zst_ph_offset - diff : 0;
-				zs->zst_len = zs->zst_len > diff ?
-				    zs->zst_len - diff : zs->zst_len;
-			}
-			zs->zst_direction = ZFETCH_BACKWARD;
-
-			break;
-
-		} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
-		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
-			/* strided forward access */
-
-			if (mutex_tryenter(&zs->zst_lock) == 0) {
-				rc = 1;
-				goto out;
-			}
-
-			if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
-			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
-				mutex_exit(&zs->zst_lock);
-				goto top;
-			}
-
-			zs->zst_offset += zs->zst_stride;
-			zs->zst_direction = ZFETCH_FORWARD;
-
-			break;
-
-		} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
-		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
-			/* strided reverse access */
-
-			if (mutex_tryenter(&zs->zst_lock) == 0) {
-				rc = 1;
-				goto out;
-			}
-
-			if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
-			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
-				mutex_exit(&zs->zst_lock);
-				goto top;
-			}
-
-			zs->zst_offset = zs->zst_offset > zs->zst_stride ?
-			    zs->zst_offset - zs->zst_stride : 0;
-			zs->zst_ph_offset = (zs->zst_ph_offset >
-			    (2 * zs->zst_stride)) ?
-			    (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
-			zs->zst_direction = ZFETCH_BACKWARD;
-
-			break;
-		}
-	}
-
-	if (zs) {
-		if (reset) {
-			zstream_t *remove = zs;
-
-			ZFETCHSTAT_BUMP(zfetchstat_stream_resets);
-			rc = 0;
-			mutex_exit(&zs->zst_lock);
-			rw_exit(&zf->zf_rwlock);
-			rw_enter(&zf->zf_rwlock, RW_WRITER);
-			/*
-			 * Relocate the stream, in case someone removes
-			 * it while we were acquiring the WRITER lock.
-			 */
-			for (zs = list_head(&zf->zf_stream); zs;
-			    zs = list_next(&zf->zf_stream, zs)) {
-				if (zs == remove) {
-					dmu_zfetch_stream_remove(zf, zs);
-					mutex_destroy(&zs->zst_lock);
-					kmem_free(zs, sizeof (zstream_t));
-					break;
-				}
-			}
-		} else {
-			ZFETCHSTAT_BUMP(zfetchstat_stream_noresets);
-			rc = 1;
-			dmu_zfetch_dofetch(zf, zs);
-			mutex_exit(&zs->zst_lock);
-		}
-	}
-out:
-	rw_exit(&zf->zf_rwlock);
-	return (rc);
-}
-
-/*
- * Clean-up state associated with a zfetch structure.  This frees allocated
- * structure members, empties the zf_stream tree, and generally makes things
- * nice.  This doesn't free the zfetch_t itself, that's left to the caller.
- */
 void
-dmu_zfetch_rele(zfetch_t *zf)
+dmu_zfetch_fini(zfetch_t *zf)
 {
-	zstream_t	*zs;
-	zstream_t	*zs_next;
+	zstream_t *zs;
 
 	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
 
-	for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
-		zs_next = list_next(&zf->zf_stream, zs);
-
-		list_remove(&zf->zf_stream, zs);
-		mutex_destroy(&zs->zst_lock);
-		kmem_free(zs, sizeof (zstream_t));
-	}
+	rw_enter(&zf->zf_rwlock, RW_WRITER);
+	while ((zs = list_head(&zf->zf_stream)) != NULL)
+		dmu_zfetch_stream_remove(zf, zs);
+	rw_exit(&zf->zf_rwlock);
 	list_destroy(&zf->zf_stream);
 	rw_destroy(&zf->zf_rwlock);
 
@@ -564,193 +162,190 @@
 }
 
 /*
- * Given a zfetch and zstream structure, insert the zstream structure into the
- * AVL tree contained within the zfetch structure.  Peform the appropriate
- * book-keeping.  It is possible that another thread has inserted a stream which
- * matches one that we are about to insert, so we must be sure to check for this
- * case.  If one is found, return failure, and let the caller cleanup the
- * duplicates.
+ * If there aren't too many streams already, create a new stream.
+ * The "blkid" argument is the next block that we expect this stream to access.
+ * While we're here, clean up old streams (which haven't been
+ * accessed for at least zfetch_min_sec_reap seconds).
  */
-static int
-dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
+static void
+dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 {
-	zstream_t	*zs_walk;
-	zstream_t	*zs_next;
+	zstream_t *zs_next;
+	int numstreams = 0;
 
 	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
 
-	for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
-		zs_next = list_next(&zf->zf_stream, zs_walk);
-
-		if (dmu_zfetch_streams_equal(zs_walk, zs)) {
-			return (0);
-		}
+	/*
+	 * Clean up old streams.
+	 */
+	for (zstream_t *zs = list_head(&zf->zf_stream);
+	    zs != NULL; zs = zs_next) {
+		zs_next = list_next(&zf->zf_stream, zs);
+		if (((gethrtime() - zs->zs_atime) / NANOSEC) >
+		    zfetch_min_sec_reap)
+			dmu_zfetch_stream_remove(zf, zs);
+		else
+			numstreams++;
 	}
 
-	list_insert_head(&zf->zf_stream, zs);
-	zf->zf_stream_cnt++;
-	return (1);
-}
-
-
-/*
- * Walk the list of zstreams in the given zfetch, find an old one (by time), and
- * reclaim it for use by the caller.
- */
-static zstream_t *
-dmu_zfetch_stream_reclaim(zfetch_t *zf)
-{
-	zstream_t	*zs;
-
-	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
-		return (0);
-
-	for (zs = list_head(&zf->zf_stream); zs;
-	    zs = list_next(&zf->zf_stream, zs)) {
-
-		if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap)
-			break;
+	/*
+	 * The maximum number of streams is normally zfetch_max_streams,
+	 * but for small files we lower it such that it's at least possible
+	 * for all the streams to be non-overlapping.
+	 *
+	 * If we are already at the maximum number of streams for this file,
+	 * even after removing old streams, then don't create this stream.
+	 */
+	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
+	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
+	    zfetch_max_distance));
+	if (numstreams >= max_streams) {
+		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
+		return;
 	}
 
-	if (zs) {
-		dmu_zfetch_stream_remove(zf, zs);
-		mutex_destroy(&zs->zst_lock);
-		bzero(zs, sizeof (zstream_t));
-	} else {
-		zf->zf_alloc_fail++;
-	}
-	rw_exit(&zf->zf_rwlock);
+	zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
+	zs->zs_blkid = blkid;
+	zs->zs_pf_blkid = blkid;
+	zs->zs_ipf_blkid = blkid;
+	zs->zs_atime = gethrtime();
+	mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
 
-	return (zs);
+	list_insert_head(&zf->zf_stream, zs);
 }
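
As a concrete check of the clamp above, take a small file with assumed
numbers: 16 allocated 128K blocks against the default 8MB prefetch distance:

    uint64_t maxblkid = 16, datablksz = 128 * 1024;   /* a ~2MB file */
    uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
        maxblkid * datablksz / zfetch_max_distance));
    /* 16 * 128K / 8M == 0, so MIN() yields 0 and MAX() clamps it to 1 */

A file this small can only usefully support one non-overlapping stream, so
any further stream-create attempts just bump the max_streams kstat.
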
 
 /*
- * Given a zfetch and zstream structure, remove the zstream structure from its
- * container in the zfetch structure.  Perform the appropriate book-keeping.
+ * This is the predictive prefetch entry point.  It associates the dnode
+ * access specified by the blkid and nblks arguments with a prefetch stream,
+ * predicts further accesses based on that stream's statistics, and initiates
+ * speculative prefetch.  The fetch_data argument specifies whether actual
+ * data blocks should be fetched:
+ *   FALSE -- prefetch only indirect blocks for predicted data blocks;
+ *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
  */
-static void
-dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
-{
-	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
-
-	list_remove(&zf->zf_stream, zs);
-	zf->zf_stream_cnt--;
-}
-
-static int
-dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
-{
-	if (zs1->zst_offset != zs2->zst_offset)
-		return (0);
-
-	if (zs1->zst_len != zs2->zst_len)
-		return (0);
-
-	if (zs1->zst_stride != zs2->zst_stride)
-		return (0);
-
-	if (zs1->zst_ph_offset != zs2->zst_ph_offset)
-		return (0);
-
-	if (zs1->zst_cap != zs2->zst_cap)
-		return (0);
-
-	if (zs1->zst_direction != zs2->zst_direction)
-		return (0);
-
-	return (1);
-}
-
-/*
- * This is the prefetch entry point.  It calls all of the other dmu_zfetch
- * routines to create, delete, find, or operate upon prefetch streams.
- */
 void
-dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
 {
-	zstream_t	zst;
-	zstream_t	*newstream;
-	boolean_t	fetched;
-	int		inserted;
-	unsigned int	blkshft;
-	uint64_t	blksz;
+	zstream_t *zs;
+	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+	int64_t pf_ahead_blks, max_blks;
+	int epbs, max_dist_blks, pf_nblks, ipf_nblks;
+	uint64_t end_of_access_blkid = blkid + nblks;
 
 	if (zfs_prefetch_disable)
 		return;
 
-	/* files that aren't ln2 blocksz are only one block -- nothing to do */
-	if (!zf->zf_dnode->dn_datablkshift)
+	/*
+	 * As a fast path for small (single-block) files, ignore access
+	 * to the first block.
+	 */
+	if (blkid == 0)
 		return;
 
-	/* convert offset and size, into blockid and nblocks */
-	blkshft = zf->zf_dnode->dn_datablkshift;
-	blksz = (1 << blkshft);
+	rw_enter(&zf->zf_rwlock, RW_READER);
 
-	bzero(&zst, sizeof (zstream_t));
-	zst.zst_offset = offset >> blkshft;
-	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
-	    P2ALIGN(offset, blksz)) >> blkshft;
+	for (zs = list_head(&zf->zf_stream); zs != NULL;
+	    zs = list_next(&zf->zf_stream, zs)) {
+		if (blkid == zs->zs_blkid) {
+			mutex_enter(&zs->zs_lock);
+			/*
+			 * zs_blkid could have changed before we
+			 * acquired zs_lock; re-check it here.
+			 */
+			if (blkid != zs->zs_blkid) {
+				mutex_exit(&zs->zs_lock);
+				continue;
+			}
+			break;
+		}
+	}
 
-	fetched = dmu_zfetch_find(zf, &zst, prefetched);
-	if (fetched) {
-		ZFETCHSTAT_BUMP(zfetchstat_hits);
-	} else {
+	if (zs == NULL) {
+		/*
+		 * This access is not part of any existing stream.  Create
+		 * a new stream for it.
+		 */
 		ZFETCHSTAT_BUMP(zfetchstat_misses);
-		fetched = dmu_zfetch_colinear(zf, &zst);
-		if (fetched) {
-			ZFETCHSTAT_BUMP(zfetchstat_colinear_hits);
-		} else {
-			ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
-		}
+		if (rw_tryupgrade(&zf->zf_rwlock))
+			dmu_zfetch_stream_create(zf, end_of_access_blkid);
+		rw_exit(&zf->zf_rwlock);
+		return;
 	}
 
-	if (!fetched) {
-		newstream = dmu_zfetch_stream_reclaim(zf);
+	/*
+	 * This access was to a block that we issued a prefetch for on
+	 * behalf of this stream. Issue further prefetches for this stream.
+	 *
+	 * Normally, we start prefetching where we stopped
+	 * prefetching last (zs_pf_blkid).  But when we get our first
+	 * hit on this stream (zs_pf_blkid == zs_blkid), we don't
+	 * want to prefetch the block we just accessed.  In this case,
+	 * start just after the block we just accessed.
+	 */
+	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
 
+	/*
+	 * Double our amount of prefetched data, but don't let the
+	 * prefetch get further ahead than zfetch_max_distance.
+	 */
+	if (fetch_data) {
+		max_dist_blks =
+		    zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
 		/*
-		 * we still couldn't find a stream, drop the lock, and allocate
-		 * one if possible.  Otherwise, give up and go home.
+		 * Previously, we were (zs_pf_blkid - blkid) ahead.  We
+		 * want to now be double that, so read that amount again,
+		 * plus the amount we are catching up by (i.e. the amount
+		 * read just now).
 		 */
-		if (newstream) {
-			ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes);
-		} else {
-			uint64_t	maxblocks;
-			uint32_t	max_streams;
-			uint32_t	cur_streams;
+		pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
+		max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
+		pf_nblks = MIN(pf_ahead_blks, max_blks);
+	} else {
+		pf_nblks = 0;
+	}
 
-			ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures);
-			cur_streams = zf->zf_stream_cnt;
-			maxblocks = zf->zf_dnode->dn_maxblkid;
+	zs->zs_pf_blkid = pf_start + pf_nblks;
 
-			max_streams = MIN(zfetch_max_streams,
-			    (maxblocks / zfetch_block_cap));
-			if (max_streams == 0) {
-				max_streams++;
-			}
+	/*
+	 * Do the same for indirects, starting from where we stopped last,
+	 * or where we will stop reading data blocks (and the indirects
+	 * that point to them).
+	 */
+	ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
+	max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
+	/*
+	 * We want to double our distance ahead of the data prefetch
+	 * (or reader, if we are not prefetching data).  Previously, we
+	 * were (zs_ipf_blkid - blkid) ahead.  To double that, we read
+	 * that amount again, plus the amount we are catching up by
+	 * (i.e. the amount read now + the amount of data prefetched now).
+	 */
+	pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
+	max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
+	ipf_nblks = MIN(pf_ahead_blks, max_blks);
+	zs->zs_ipf_blkid = ipf_start + ipf_nblks;
 
-			if (cur_streams >= max_streams) {
-				return;
-			}
-			newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
-		}
+	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+	ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
+	ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
 
-		newstream->zst_offset = zst.zst_offset;
-		newstream->zst_len = zst.zst_len;
-		newstream->zst_stride = zst.zst_len;
-		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
-		newstream->zst_cap = zst.zst_len;
-		newstream->zst_direction = ZFETCH_FORWARD;
-		newstream->zst_last = ddi_get_lbolt();
+	zs->zs_atime = gethrtime();
+	zs->zs_blkid = end_of_access_blkid;
+	mutex_exit(&zs->zs_lock);
+	rw_exit(&zf->zf_rwlock);
 
-		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
+	/*
+	 * dbuf_prefetch() is asynchronous (even when it needs to read
+	 * indirect blocks), but we still prefer to drop our locks before
+	 * calling it to reduce the time we hold them.
+	 */
 
-		rw_enter(&zf->zf_rwlock, RW_WRITER);
-		inserted = dmu_zfetch_stream_insert(zf, newstream);
-		rw_exit(&zf->zf_rwlock);
-
-		if (!inserted) {
-			mutex_destroy(&newstream->zst_lock);
-			kmem_free(newstream, sizeof (zstream_t));
-		}
+	for (int i = 0; i < pf_nblks; i++) {
+		dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
+		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
 	}
+	for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
+		dbuf_prefetch(zf->zf_dnode, 1, iblk,
+		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+	}
+	ZFETCHSTAT_BUMP(zfetchstat_hits);
 }
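
The distance-doubling arithmetic is easiest to follow with numbers.  Assume
128K data blocks (dn_datablkshift = 17) and a stream that was prefetched 8
blocks ahead when the reader consumes one more block (all values below are
hypothetical stream state, not taken from the diff):

    uint64_t blkid = 100, nblks = 1;
    uint64_t zs_blkid = 100, zs_pf_blkid = 108;     /* 8 blocks ahead */
    int max_dist_blks = (8 * 1024 * 1024) >> 17;    /* 64 blocks */

    uint64_t end = blkid + nblks;                   /* 101 */
    int64_t pf_start = MAX(zs_pf_blkid, end);       /* 108 */
    int64_t pf_ahead_blks = zs_pf_blkid - blkid + nblks;    /* 9 */
    int64_t max_blks = max_dist_blks - (pf_start - end);    /* 57 */
    int64_t pf_nblks = MIN(pf_ahead_blks, max_blks);        /* 9 */
    /* zs_pf_blkid becomes 117: now 16 blocks ahead of the reader */

Each hit roughly doubles how far the stream runs ahead of the application,
until the 64-block (zfetch_max_distance) ceiling stops the growth.
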

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,9 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
@@ -36,9 +38,8 @@
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
+#include <sys/range_tree.h>
 
-static int free_range_compar(const void *node1, const void *node2);
-
 static kmem_cache_t *dnode_cache;
 /*
  * Define DNODE_STATS to turn on statistic gathering. By default, it is only
@@ -59,10 +60,47 @@
 int zfs_default_bs = SPA_MINBLOCKSHIFT;
 int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
 
-#ifdef sun
+#ifdef illumos
 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
 #endif
 
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+	const dmu_buf_impl_t *d1 = x1;
+	const dmu_buf_impl_t *d2 = x2;
+
+	if (d1->db_level < d2->db_level) {
+		return (-1);
+	}
+	if (d1->db_level > d2->db_level) {
+		return (1);
+	}
+
+	if (d1->db_blkid < d2->db_blkid) {
+		return (-1);
+	}
+	if (d1->db_blkid > d2->db_blkid) {
+		return (1);
+	}
+
+	if (d1->db_state == DB_SEARCH) {
+		ASSERT3S(d2->db_state, !=, DB_SEARCH);
+		return (-1);
+	} else if (d2->db_state == DB_SEARCH) {
+		ASSERT3S(d1->db_state, !=, DB_SEARCH);
+		return (1);
+	}
+
+	if ((uintptr_t)d1 < (uintptr_t)d2) {
+		return (-1);
+	}
+	if ((uintptr_t)d1 > (uintptr_t)d2) {
+		return (1);
+	}
+	return (0);
+}
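
dbuf_compare() gives the new dn_dbufs AVL tree a total order: level first,
then blkid, then DB_SEARCH sentinels ahead of real dbufs, with pointer
identity as the final tiebreaker so distinct dbufs never compare equal.  The
sentinel ordering is what lets range walks seed a stack-allocated search key;
a hedged sketch of that idiom (field names from the diff, locking elided):

    /* find the first level-0 dbuf with db_blkid >= start_blkid */
    dmu_buf_impl_t db_search;
    avl_index_t where;

    db_search.db_level = 0;
    db_search.db_blkid = start_blkid;
    db_search.db_state = DB_SEARCH;   /* sorts before any real dbuf here */

    VERIFY(avl_find(&dn->dn_dbufs, &db_search, &where) == NULL);
    dmu_buf_impl_t *db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

Because the comparator never returns 0 for a DB_SEARCH node, avl_find()
always misses and avl_nearest(AVL_AFTER) lands on the first real dbuf at or
past the requested position.
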
+
 /* ARGSUSED */
 static int
 dnode_cons(void *arg, void *unused, int kmflag)
@@ -93,9 +131,7 @@
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		list_link_init(&dn->dn_dirty_link[i]);
-		avl_create(&dn->dn_ranges[i], free_range_compar,
-		    sizeof (free_range_t),
-		    offsetof(struct free_range, fr_node));
+		dn->dn_free_ranges[i] = NULL;
 		list_create(&dn->dn_dirty_records[i],
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
@@ -118,7 +154,8 @@
 	dn->dn_id_flags = 0;
 
 	dn->dn_dbufs_count = 0;
-	list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+	dn->dn_unlisted_l0_blkid = 0;
+	avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 
 	dn->dn_moved = 0;
@@ -143,7 +180,7 @@
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
-		avl_destroy(&dn->dn_ranges[i]);
+		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
 		list_destroy(&dn->dn_dirty_records[i]);
 		ASSERT0(dn->dn_next_nblkptr[i]);
 		ASSERT0(dn->dn_next_nlevels[i]);
@@ -171,7 +208,8 @@
 	ASSERT0(dn->dn_id_flags);
 
 	ASSERT0(dn->dn_dbufs_count);
-	list_destroy(&dn->dn_dbufs);
+	ASSERT0(dn->dn_unlisted_l0_blkid);
+	avl_destroy(&dn->dn_dbufs);
 }
 
 void
@@ -315,19 +353,6 @@
 	}
 }
 
-static int
-free_range_compar(const void *node1, const void *node2)
-{
-	const free_range_t *rp1 = node1;
-	const free_range_t *rp2 = node2;
-
-	if (rp1->fr_blkid < rp2->fr_blkid)
-		return (-1);
-	else if (rp1->fr_blkid > rp2->fr_blkid)
-		return (1);
-	else return (0);
-}
-
 void
 dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 {
@@ -376,7 +401,7 @@
 	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
 	dn->dn_datablksz = size;
 	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
-	dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
+	dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
 }
 
 static dnode_t *
@@ -383,8 +408,9 @@
 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
     uint64_t object, dnode_handle_t *dnh)
 {
-	dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+	dnode_t *dn;
 
+	dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
 	ASSERT(!POINTER_IS_VALID(dn->dn_objset));
 	dn->dn_moved = 0;
 
@@ -421,13 +447,31 @@
 	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 
 	mutex_enter(&os->os_lock);
-	list_insert_head(&os->os_dnodes, dn);
+	if (dnh->dnh_dnode != NULL) {
+		/* Lost the allocation race. */
+		mutex_exit(&os->os_lock);
+		kmem_cache_free(dnode_cache, dn);
+		return (dnh->dnh_dnode);
+	}
+
+	/*
+	 * Exclude special dnodes from os_dnodes so an empty os_dnodes
+	 * signifies that the special dnodes have no references from
+	 * their children (the entries in os_dnodes).  This allows
+	 * dnode_destroy() to easily determine if the last child has
+	 * been removed and then complete eviction of the objset.
+	 */
+	if (!DMU_OBJECT_IS_SPECIAL(object))
+		list_insert_head(&os->os_dnodes, dn);
 	membar_producer();
+
 	/*
-	 * Everything else must be valid before assigning dn_objset makes the
-	 * dnode eligible for dnode_move().
+	 * Everything else must be valid before assigning dn_objset
+	 * makes the dnode eligible for dnode_move().
 	 */
 	dn->dn_objset = os;
+
+	dnh->dnh_dnode = dn;
 	mutex_exit(&os->os_lock);
 
 	arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
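
dnode_create() now re-checks dnh_dnode under os_lock and discards its own allocation if another thread won, replacing the atomic_cas_ptr() dance removed from dnode_hold_impl() later in this diff. A minimal pthread sketch of the pattern (hypothetical names):

    /*
     * User-space sketch of the "lost the allocation race" logic above:
     * allocate outside the lock, publish under the lock only if no other
     * thread got there first, and have the loser adopt the winner's copy.
     */
    #include <stdlib.h>
    #include <pthread.h>

    typedef struct dnode_sketch { int dummy; } dnode_sketch_t;

    static pthread_mutex_t os_lock = PTHREAD_MUTEX_INITIALIZER;
    static dnode_sketch_t *dnh_dnode;    /* the handle's published slot */

    static dnode_sketch_t *
    dnode_create_sketch(void)
    {
        dnode_sketch_t *dn = calloc(1, sizeof (*dn));  /* may race */

        pthread_mutex_lock(&os_lock);
        if (dnh_dnode != NULL) {
            /* Lost the allocation race. */
            pthread_mutex_unlock(&os_lock);
            free(dn);
            return (dnh_dnode);
        }
        dnh_dnode = dn;                  /* we are the winner */
        pthread_mutex_unlock(&os_lock);
        return (dn);
    }
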
@@ -441,12 +485,18 @@
 dnode_destroy(dnode_t *dn)
 {
 	objset_t *os = dn->dn_objset;
+	boolean_t complete_os_eviction = B_FALSE;
 
 	ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 
 	mutex_enter(&os->os_lock);
 	POINTER_INVALIDATE(&dn->dn_objset);
-	list_remove(&os->os_dnodes, dn);
+	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+		list_remove(&os->os_dnodes, dn);
+		complete_os_eviction =
+		    list_is_empty(&os->os_dnodes) &&
+		    list_link_active(&os->os_evicting_node);
+	}
 	mutex_exit(&os->os_lock);
 
 	/* the dnode can no longer move, so we can release the handle */
@@ -463,7 +513,7 @@
 	}
 	if (dn->dn_bonus != NULL) {
 		mutex_enter(&dn->dn_bonus->db_mtx);
-		dbuf_evict(dn->dn_bonus);
+		dbuf_destroy(dn->dn_bonus);
 		dn->dn_bonus = NULL;
 	}
 	dn->dn_zio = NULL;
@@ -476,10 +526,14 @@
 	dn->dn_newuid = 0;
 	dn->dn_newgid = 0;
 	dn->dn_id_flags = 0;
+	dn->dn_unlisted_l0_blkid = 0;
 
-	dmu_zfetch_rele(&dn->dn_zfetch);
+	dmu_zfetch_fini(&dn->dn_zfetch);
 	kmem_cache_free(dnode_cache, dn);
 	arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
+
+	if (complete_os_eviction)
+		dmu_objset_evict_done(os);
 }
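
The complete_os_eviction logic above works because special dnodes were just excluded from os_dnodes: when the last ordinary child goes away while the objset sits on the eviction list, the destroying thread finishes the job. A compact sketch of that check (illustrative names only):

    #include <stdbool.h>

    typedef struct objset_sketch {
        int os_ndnodes;      /* entries on os_dnodes (special excluded) */
        bool os_evicting;    /* objset queued on os_evicting_node */
    } objset_sketch_t;

    /* Returns true when the caller must call dmu_objset_evict_done(). */
    static bool
    last_child_gone(objset_sketch_t *os, bool is_special)
    {
        if (is_special)
            return (false);    /* special dnodes are not counted */
        os->os_ndnodes--;      /* list_remove(&os->os_dnodes, dn) */
        return (os->os_ndnodes == 0 && os->os_evicting);
    }
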
 
 void
@@ -488,10 +542,10 @@
 {
 	int i;
 
+	ASSERT3U(blocksize, <=,
+	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	if (blocksize == 0)
 		blocksize = 1 << zfs_default_bs;
-	else if (blocksize > SPA_MAXBLOCKSIZE)
-		blocksize = SPA_MAXBLOCKSIZE;
 	else
 		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 
@@ -519,7 +573,7 @@
 	ASSERT0(dn->dn_assigned_txg);
 	ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 	ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
-	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+	ASSERT(avl_is_empty(&dn->dn_dbufs));
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		ASSERT0(dn->dn_next_nblkptr[i]);
@@ -531,7 +585,7 @@
 		ASSERT0(dn->dn_next_blksz[i]);
 		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 		ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
-		ASSERT0(avl_numnodes(&dn->dn_ranges[i]));
+		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
 	}
 
 	dn->dn_type = ot;
@@ -572,7 +626,8 @@
 	int nblkptr;
 
 	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
-	ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+	ASSERT3U(blocksize, <=,
+	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	ASSERT0(blocksize % SPA_MINBLOCKSIZE);
 	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
 	ASSERT(tx->tx_txg != 0);
@@ -695,7 +750,8 @@
 		list_move_tail(&ndn->dn_dirty_records[i],
 		    &odn->dn_dirty_records[i]);
 	}
-	bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
+	bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
+	    sizeof (odn->dn_free_ranges));
 	ndn->dn_allocated_txg = odn->dn_allocated_txg;
 	ndn->dn_free_txg = odn->dn_free_txg;
 	ndn->dn_assigned_txg = odn->dn_assigned_txg;
@@ -703,9 +759,10 @@
 	ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
 	ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
 	refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
-	ASSERT(list_is_empty(&ndn->dn_dbufs));
-	list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+	ASSERT(avl_is_empty(&ndn->dn_dbufs));
+	avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
 	ndn->dn_dbufs_count = odn->dn_dbufs_count;
+	ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
 	ndn->dn_bonus = odn->dn_bonus;
 	ndn->dn_have_spill = odn->dn_have_spill;
 	ndn->dn_zio = odn->dn_zio;
@@ -719,8 +776,6 @@
 	dmu_zfetch_init(&ndn->dn_zfetch, NULL);
 	list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
 	ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
-	ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
-	ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
 
 	/*
 	 * Update back pointers. Updating the handle fixes the back pointer of
@@ -737,9 +792,10 @@
 	 */
 	odn->dn_dbuf = NULL;
 	odn->dn_handle = NULL;
-	list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+	avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 	odn->dn_dbufs_count = 0;
+	odn->dn_unlisted_l0_blkid = 0;
 	odn->dn_bonus = NULL;
 	odn->dn_zfetch.zf_dnode = NULL;
 
@@ -756,8 +812,7 @@
 		list_create(&odn->dn_dirty_records[i],
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
-		odn->dn_ranges[i].avl_root = NULL;
-		odn->dn_ranges[i].avl_numnodes = 0;
+		odn->dn_free_ranges[i] = NULL;
 		odn->dn_next_nlevels[i] = 0;
 		odn->dn_next_indblkshift[i] = 0;
 		odn->dn_next_bonustype[i] = 0;
@@ -787,7 +842,7 @@
 	odn->dn_moved = (uint8_t)-1;
 }
 
-#ifdef sun
+#ifdef illumos
 #ifdef	_KERNEL
 /*ARGSUSED*/
 static kmem_cbrc_t
@@ -930,7 +985,7 @@
 	return (KMEM_CBRC_YES);
 }
 #endif	/* _KERNEL */
-#endif	/* sun */
+#endif	/* illumos */
 
 void
 dnode_special_close(dnode_handle_t *dnh)
@@ -945,6 +1000,8 @@
 	 */
 	while (refcount_count(&dn->dn_holds) > 0)
 		delay(1);
+	ASSERT(dn->dn_dbuf == NULL ||
+	    dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
 	zrl_add(&dnh->dnh_zrlock);
 	dnode_destroy(dn); /* implicit zrl_remove() */
 	zrl_destroy(&dnh->dnh_zrlock);
@@ -951,27 +1008,24 @@
 	dnh->dnh_dnode = NULL;
 }
 
-dnode_t *
+void
 dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
     dnode_handle_t *dnh)
 {
-	dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
-	dnh->dnh_dnode = dn;
+	dnode_t *dn;
+
+	dn = dnode_create(os, dnp, NULL, object, dnh);
 	zrl_init(&dnh->dnh_zrlock);
 	DNODE_VERIFY(dn);
-	return (dn);
 }
 
 static void
-dnode_buf_pageout(dmu_buf_t *db, void *arg)
+dnode_buf_pageout(void *dbu)
 {
-	dnode_children_t *children_dnodes = arg;
+	dnode_children_t *children_dnodes = dbu;
 	int i;
-	int epb = db->db_size >> DNODE_SHIFT;
 
-	ASSERT(epb == children_dnodes->dnc_count);
-
-	for (i = 0; i < epb; i++) {
+	for (i = 0; i < children_dnodes->dnc_count; i++) {
 		dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
 		dnode_t *dn;
 
@@ -1001,7 +1055,7 @@
 		dnh->dnh_dnode = NULL;
 	}
 	kmem_free(children_dnodes, sizeof (dnode_children_t) +
-	    (epb - 1) * sizeof (dnode_handle_t));
+	    children_dnodes->dnc_count * sizeof (dnode_handle_t));
 }
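
Note the sizing change from `(epb - 1)` to `epb` handles in the kmem_free() above (and in the matching kmem_zalloc() further down): the old arithmetic matches a trailing one-element array, the new one a C99 flexible array member, which is presumably what dnode_children_t now declares. A user-space illustration of the two idioms:

    #include <stdlib.h>

    typedef struct { int h; } handle_sketch_t;

    typedef struct old_children {
        size_t count;
        handle_sketch_t children[1];    /* pre-C99 idiom */
    } old_children_t;

    typedef struct new_children {
        size_t count;
        handle_sketch_t children[];     /* C99 flexible array member */
    } new_children_t;

    int
    main(void)
    {
        size_t epb = 32;
        old_children_t *o = malloc(sizeof (*o) +
            (epb - 1) * sizeof (handle_sketch_t));  /* [1] in struct */
        new_children_t *n = malloc(sizeof (*n) +
            epb * sizeof (handle_sketch_t));        /* all epb extra */
        free(o);
        free(n);
        return (0);
    }
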
 
 /*
@@ -1062,7 +1116,7 @@
 		drop_struct_lock = TRUE;
 	}
 
-	blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
 
 	db = dbuf_hold(mdn, blk, FTAG);
 	if (drop_struct_lock)
@@ -1086,17 +1140,23 @@
 		int i;
 		dnode_children_t *winner;
 		children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
-		    (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
+		    epb * sizeof (dnode_handle_t), KM_SLEEP);
 		children_dnodes->dnc_count = epb;
 		dnh = &children_dnodes->dnc_children[0];
 		for (i = 0; i < epb; i++) {
 			zrl_init(&dnh[i].dnh_zrlock);
-			dnh[i].dnh_dnode = NULL;
 		}
-		if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
-		    dnode_buf_pageout)) {
+		dmu_buf_init_user(&children_dnodes->dnc_dbu,
+		    dnode_buf_pageout, NULL);
+		winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
+		if (winner != NULL) {
+
+			for (i = 0; i < epb; i++) {
+				zrl_destroy(&dnh[i].dnh_zrlock);
+			}
+
 			kmem_free(children_dnodes, sizeof (dnode_children_t) +
-			    (epb - 1) * sizeof (dnode_handle_t));
+			    epb * sizeof (dnode_handle_t));
 			children_dnodes = winner;
 		}
 	}
@@ -1104,17 +1164,11 @@
 
 	dnh = &children_dnodes->dnc_children[idx];
 	zrl_add(&dnh->dnh_zrlock);
-	if ((dn = dnh->dnh_dnode) == NULL) {
+	dn = dnh->dnh_dnode;
+	if (dn == NULL) {
 		dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
-		dnode_t *winner;
 
 		dn = dnode_create(os, phys, db, object, dnh);
-		winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
-		if (winner != NULL) {
-			zrl_add(&dnh->dnh_zrlock);
-			dnode_destroy(dn); /* implicit zrl_remove() */
-			dn = winner;
-		}
 	}
 
 	mutex_enter(&dn->dn_mtx);
@@ -1128,10 +1182,10 @@
 		dbuf_rele(db, FTAG);
 		return (type == DMU_OT_NONE ? ENOENT : EEXIST);
 	}
+	if (refcount_add(&dn->dn_holds, tag) == 1)
+		dbuf_add_ref(db, dnh);
 	mutex_exit(&dn->dn_mtx);
 
-	if (refcount_add(&dn->dn_holds, tag) == 1)
-		dbuf_add_ref(db, dnh);
 	/* Now we can rely on the hold to prevent the dnode from moving. */
 	zrl_remove(&dnh->dnh_zrlock);
 
@@ -1174,12 +1228,18 @@
 void
 dnode_rele(dnode_t *dn, void *tag)
 {
+	mutex_enter(&dn->dn_mtx);
+	dnode_rele_and_unlock(dn, tag);
+}
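
dnode_rele() is now a thin wrapper over dnode_rele_and_unlock() (added just below), so callers that already hold dn_mtx can release a hold without an unlock/relock window. A generic pthread sketch of the *_and_unlock idiom, under assumed names:

    #include <pthread.h>

    typedef struct {
        pthread_mutex_t mtx;
        int holds;
    } obj_sketch_t;

    /* Caller holds o->mtx; this drops it as a side effect. */
    static void
    obj_rele_and_unlock(obj_sketch_t *o)
    {
        int refs = --o->holds;
        pthread_mutex_unlock(&o->mtx);
        if (refs == 0) {
            /* last hold gone: tear down, e.g. release parent dbuf */
        }
    }

    static void
    obj_rele(obj_sketch_t *o)
    {
        pthread_mutex_lock(&o->mtx);
        obj_rele_and_unlock(o);
    }
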
+
+void
+dnode_rele_and_unlock(dnode_t *dn, void *tag)
+{
 	uint64_t refs;
 	/* Get while the hold prevents the dnode from moving. */
 	dmu_buf_impl_t *db = dn->dn_dbuf;
 	dnode_handle_t *dnh = dn->dn_handle;
 
-	mutex_enter(&dn->dn_mtx);
 	refs = refcount_remove(&dn->dn_holds, tag);
 	mutex_exit(&dn->dn_mtx);
 
@@ -1243,7 +1303,8 @@
 		return;
 	}
 
-	ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+	ASSERT(!refcount_is_zero(&dn->dn_holds) ||
+	    !avl_is_empty(&dn->dn_dbufs));
 	ASSERT(dn->dn_datablksz != 0);
 	ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
 	ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
@@ -1316,13 +1377,12 @@
 int
 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 {
-	dmu_buf_impl_t *db, *db_next;
+	dmu_buf_impl_t *db;
 	int err;
 
+	ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	if (size == 0)
 		size = SPA_MINBLOCKSIZE;
-	if (size > SPA_MAXBLOCKSIZE)
-		size = SPA_MAXBLOCKSIZE;
 	else
 		size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
 
@@ -1335,13 +1395,12 @@
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 
 	/* Check for any allocated blocks beyond the first */
-	if (dn->dn_phys->dn_maxblkid != 0)
+	if (dn->dn_maxblkid != 0)
 		goto fail;
 
 	mutex_enter(&dn->dn_dbufs_mtx);
-	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
-		db_next = list_next(&dn->dn_dbufs, db);
-
+	for (db = avl_first(&dn->dn_dbufs); db != NULL;
+	    db = AVL_NEXT(&dn->dn_dbufs, db)) {
 		if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
 		    db->db_blkid != DMU_SPILL_BLKID) {
 			mutex_exit(&dn->dn_dbufs_mtx);
@@ -1354,7 +1413,7 @@
 		goto fail;
 
 	/* resize the old block */
-	err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+	err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
 	if (err == 0)
 		dbuf_new_size(db, size, tx);
 	else if (err != ENOENT)
@@ -1462,56 +1521,13 @@
 		rw_downgrade(&dn->dn_struct_rwlock);
 }
 
-void
-dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
 {
-	avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
-	avl_index_t where;
-	free_range_t *rp;
-	free_range_t rp_tofind;
-	uint64_t endblk = blkid + nblks;
-
-	ASSERT(MUTEX_HELD(&dn->dn_mtx));
-	ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
-
-	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
-	    blkid, nblks, tx->tx_txg);
-	rp_tofind.fr_blkid = blkid;
-	rp = avl_find(tree, &rp_tofind, &where);
-	if (rp == NULL)
-		rp = avl_nearest(tree, where, AVL_BEFORE);
-	if (rp == NULL)
-		rp = avl_nearest(tree, where, AVL_AFTER);
-
-	while (rp && (rp->fr_blkid <= blkid + nblks)) {
-		uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
-		free_range_t *nrp = AVL_NEXT(tree, rp);
-
-		if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
-			/* clear this entire range */
-			avl_remove(tree, rp);
-			kmem_free(rp, sizeof (free_range_t));
-		} else if (blkid <= rp->fr_blkid &&
-		    endblk > rp->fr_blkid && endblk < fr_endblk) {
-			/* clear the beginning of this range */
-			rp->fr_blkid = endblk;
-			rp->fr_nblks = fr_endblk - endblk;
-		} else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
-		    endblk >= fr_endblk) {
-			/* clear the end of this range */
-			rp->fr_nblks = blkid - rp->fr_blkid;
-		} else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
-			/* clear a chunk out of this range */
-			free_range_t *new_rp =
-			    kmem_alloc(sizeof (free_range_t), KM_SLEEP);
-
-			new_rp->fr_blkid = endblk;
-			new_rp->fr_nblks = fr_endblk - endblk;
-			avl_insert_here(tree, new_rp, rp, AVL_AFTER);
-			rp->fr_nblks = blkid - rp->fr_blkid;
-		}
-		/* there may be no overlap */
-		rp = nrp;
+	dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+	if (db != NULL) {
+		dmu_buf_will_dirty(&db->db, tx);
+		dbuf_rele(db, FTAG);
 	}
 }
 
@@ -1529,7 +1545,7 @@
 	blkshift = dn->dn_datablkshift;
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
-	if (len == -1ULL) {
+	if (len == DMU_OBJECT_END) {
 		len = UINT64_MAX - off;
 		trunc = TRUE;
 	}
@@ -1545,7 +1561,13 @@
 	} else {
 		ASSERT(dn->dn_maxblkid == 0);
 		if (off == 0 && len >= blksz) {
-			/* Freeing the whole block; fast-track this request */
+			/*
+			 * Freeing the whole block; fast-track this request.
+			 * Note that we won't dirty any indirect blocks,
+			 * which is fine because we will be freeing the entire
+			 * file and thus all indirect blocks will be freed
+			 * by free_children().
+			 */
 			blkid = 0;
 			nblks = 1;
 			goto done;
@@ -1564,8 +1586,8 @@
 		ASSERT3U(blkoff + head, ==, blksz);
 		if (len < head)
 			head = len;
-		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
-		    FTAG, &db) == 0) {
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+		    TRUE, FALSE, FTAG, &db) == 0) {
 			caddr_t data;
 
 			/* don't dirty if it isn't on disk and isn't dirty */
@@ -1572,7 +1594,7 @@
 			if (db->db_last_dirty ||
 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
 				rw_exit(&dn->dn_struct_rwlock);
-				dbuf_will_dirty(db, tx);
+				dmu_buf_will_dirty(&db->db, tx);
 				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 				data = db->db.db_data;
 				bzero(data + blkoff, head);
@@ -1602,13 +1624,13 @@
 	if (tail) {
 		if (len < tail)
 			tail = len;
-		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
-		    TRUE, FTAG, &db) == 0) {
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+		    TRUE, FALSE, FTAG, &db) == 0) {
 			/* don't dirty if not on disk and not dirty */
 			if (db->db_last_dirty ||
 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
 				rw_exit(&dn->dn_struct_rwlock);
-				dbuf_will_dirty(db, tx);
+				dmu_buf_will_dirty(&db->db, tx);
 				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 				bzero(db->db.db_data, tail);
 			}
@@ -1629,45 +1651,70 @@
 		nblks += 1;
 
 	/*
-	 * Read in and mark all the level-1 indirects dirty,
-	 * so that they will stay in memory until syncing phase.
-	 * Always dirty the first and last indirect to make sure
-	 * we dirty all the partial indirects.
+	 * Dirty all the indirect blocks in this range.  Note that only
+	 * the first and last indirect blocks can actually be written
+	 * (if they were partially freed) -- they must be dirtied, even if
+	 * they do not exist on disk yet.  The interior blocks will
+	 * be freed by free_children(), so they will not actually be written.
+	 * Even though these interior blocks will not be written, we
+	 * dirty them for two reasons:
+	 *
+	 *  - It ensures that the indirect blocks remain in memory until
+	 *    syncing context.  (They have already been prefetched by
+	 *    dmu_tx_hold_free(), so we don't have to worry about reading
+	 *    them serially here.)
+	 *
+	 *  - The dirty space accounting will put pressure on the txg sync
+	 *    mechanism to begin syncing, and to delay transactions if there
+	 *    is a large amount of freeing.  Even though these indirect
+	 *    blocks will not be written, we could need to write the same
+	 *    amount of space if we copy the freed BPs into deadlists.
 	 */
 	if (dn->dn_nlevels > 1) {
-		uint64_t i, first, last;
-		int shift = epbs + dn->dn_datablkshift;
+		uint64_t first, last;
 
 		first = blkid >> epbs;
-		if (db = dbuf_hold_level(dn, 1, first, FTAG)) {
-			dbuf_will_dirty(db, tx);
-			dbuf_rele(db, FTAG);
-		}
+		dnode_dirty_l1(dn, first, tx);
 		if (trunc)
 			last = dn->dn_maxblkid >> epbs;
 		else
 			last = (blkid + nblks - 1) >> epbs;
-		if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
-			dbuf_will_dirty(db, tx);
-			dbuf_rele(db, FTAG);
-		}
-		for (i = first + 1; i < last; i++) {
+		if (last != first)
+			dnode_dirty_l1(dn, last, tx);
+
+		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
+		    SPA_BLKPTRSHIFT;
+		for (uint64_t i = first + 1; i < last; i++) {
+			/*
+			 * Set i to the blockid of the next non-hole
+			 * level-1 indirect block at or after i.  Note
+			 * that dnode_next_offset() operates in terms of
+			 * level-0-equivalent bytes.
+			 */
 			uint64_t ibyte = i << shift;
-			int err;
-
-			err = dnode_next_offset(dn,
-			    DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0);
+			int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+			    &ibyte, 2, 1, 0);
 			i = ibyte >> shift;
-			if (err == ESRCH || i >= last)
+			if (i >= last)
 				break;
-			ASSERT(err == 0);
-			db = dbuf_hold_level(dn, 1, i, FTAG);
-			if (db) {
-				dbuf_will_dirty(db, tx);
-				dbuf_rele(db, FTAG);
-			}
+
+			/*
+			 * Normally we should not see an error, either
+			 * from dnode_next_offset() or dbuf_hold_level()
+			 * (except for ESRCH from dnode_next_offset).
+			 * If there is an i/o error, then when we read
+			 * this block in syncing context, it will use
+			 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+			 * to the "failmode" property.  dnode_next_offset()
+			 * doesn't have a flag to indicate MUSTSUCCEED.
+			 */
+			if (err != 0)
+				break;
+
+			dnode_dirty_l1(dn, i, tx);
 		}
 	}
+
 done:
 	/*
 	 * Add this range to the dnode range list.
@@ -1674,29 +1721,20 @@
 	 * We will finish up this free operation in the syncing phase.
 	 */
 	mutex_enter(&dn->dn_mtx);
-	dnode_clear_range(dn, blkid, nblks, tx);
-	{
-		free_range_t *rp, *found;
-		avl_index_t where;
-		avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
-
-		/* Add new range to dn_ranges */
-		rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
-		rp->fr_blkid = blkid;
-		rp->fr_nblks = nblks;
-		found = avl_find(tree, rp, &where);
-		ASSERT(found == NULL);
-		avl_insert(tree, rp, where);
-		dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
-		    blkid, nblks, tx->tx_txg);
+	int txgoff = tx->tx_txg & TXG_MASK;
+	if (dn->dn_free_ranges[txgoff] == NULL) {
+		dn->dn_free_ranges[txgoff] =
+		    range_tree_create(NULL, NULL, &dn->dn_mtx);
 	}
+	range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
+	range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
+	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+	    blkid, nblks, tx->tx_txg);
 	mutex_exit(&dn->dn_mtx);
 
 	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
 	dnode_setdirty(dn, tx);
 out:
-	if (trunc && dn->dn_maxblkid >= (off >> blkshift))
-		dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0);
 
 	rw_exit(&dn->dn_struct_rwlock);
 }
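
The done: path above replaces the hand-rolled AVL of free_range_t entries with one lazily created range tree per in-flight txg; the clear-then-add pair makes repeated frees of overlapping ranges idempotent, since range_tree_add() would otherwise be handed an overlap. A toy user-space sketch of the accumulation, with a bitmap standing in for the real range tree:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    #define TXG_SIZE 4
    #define TXG_MASK (TXG_SIZE - 1)
    #define NBLKS    1024    /* toy object size, in blocks */

    typedef struct { unsigned char map[NBLKS]; } range_tree_sketch_t;

    static range_tree_sketch_t *free_ranges[TXG_SIZE];

    /* Record [blkid, blkid + nblks) as freed in txg; overlap is fine. */
    static void
    free_range_sketch(uint64_t txg, uint64_t blkid, uint64_t nblks)
    {
        int txgoff = txg & TXG_MASK;

        if (free_ranges[txgoff] == NULL)    /* lazily create the tree */
            free_ranges[txgoff] =
                calloc(1, sizeof (range_tree_sketch_t));
        memset(free_ranges[txgoff]->map + blkid, 0, nblks);  /* clear */
        memset(free_ranges[txgoff]->map + blkid, 1, nblks);  /* add */
    }
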
@@ -1719,7 +1757,6 @@
 uint64_t
 dnode_block_freed(dnode_t *dn, uint64_t blkid)
 {
-	free_range_t range_tofind;
 	void *dp = spa_get_dsl(dn->dn_objset->os_spa);
 	int i;
 
@@ -1739,21 +1776,11 @@
 	if (blkid == DMU_SPILL_BLKID)
 		return (dnode_spill_freed(dn));
 
-	range_tofind.fr_blkid = blkid;
 	mutex_enter(&dn->dn_mtx);
 	for (i = 0; i < TXG_SIZE; i++) {
-		free_range_t *range_found;
-		avl_index_t idx;
-
-		range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
-		if (range_found) {
-			ASSERT(range_found->fr_nblks > 0);
+		if (dn->dn_free_ranges[i] != NULL &&
+		    range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
 			break;
-		}
-		range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
-		if (range_found &&
-		    range_found->fr_blkid + range_found->fr_nblks > blkid)
-			break;
 	}
 	mutex_exit(&dn->dn_mtx);
 	return (i < TXG_SIZE);
@@ -1789,9 +1816,8 @@
 }
 
 /*
- * Call when we think we're going to write/free space in open context.
- * Be conservative (ie. OK to write less than this or free more than
- * this, but don't write more or free less).
+ * Call when we think we're going to write/free space in open context to track
+ * the amount of memory in use by the currently open txg.
  */
 void
 dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
@@ -1798,14 +1824,14 @@
 {
 	objset_t *os = dn->dn_objset;
 	dsl_dataset_t *ds = os->os_dsl_dataset;
+	int64_t aspace = spa_get_asize(os->os_spa, space);
 
-	if (space > 0)
-		space = spa_get_asize(os->os_spa, space);
+	if (ds != NULL) {
+		dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
+		dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
+	}
 
-	if (ds)
-		dsl_dir_willuse_space(ds->ds_dir, space, tx);
-
-	dmu_tx_willuse_space(tx, space);
+	dmu_tx_willuse_space(tx, aspace);
 }
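
The reworked dnode_willuse_space() charges the raw logical size to the pool's dirty-space accounting but the inflated worst-case allocated size to the dsl_dir and tx accounting. Assuming spa_get_asize() is a simple multiplication by the spa_asize_inflation tunable (declared extern later in this diff; 24 is assumed here as its default), the split looks like:

    #include <stdio.h>
    #include <stdint.h>

    static int spa_asize_inflation = 24;    /* assumed default */

    static int64_t
    spa_get_asize_sketch(int64_t space)
    {
        return (space * spa_asize_inflation);
    }

    int
    main(void)
    {
        int64_t space = 128 * 1024;    /* one 128K logical write */
        int64_t aspace = spa_get_asize_sketch(space);

        printf("dsl_pool_dirty_space charge: %jd bytes\n",
            (intmax_t)space);
        printf("dsl_dir/tx charge:          %jd bytes\n",
            (intmax_t)aspace);
        return (0);
    }
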
 
 /*
@@ -1828,7 +1854,7 @@
  */
 static int
 dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
-	int lvl, uint64_t blkfill, uint64_t txg)
+    int lvl, uint64_t blkfill, uint64_t txg)
 {
 	dmu_buf_impl_t *db = NULL;
 	void *data = NULL;
@@ -1850,8 +1876,8 @@
 		epb = dn->dn_phys->dn_nblkptr;
 		data = dn->dn_phys->dn_blkptr;
 	} else {
-		uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
-		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
 		if (error) {
 			if (error != ENOENT)
 				return (error);
@@ -1874,8 +1900,10 @@
 		data = db->db.db_data;
 	}
 
-	if (db && txg &&
-	    (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
+
+	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
+	    db->db_blkptr->blk_birth <= txg ||
+	    BP_IS_HOLE(db->db_blkptr))) {
 		/*
 		 * This can only happen when we are searching up the tree
 		 * and these conditions mean that we need to keep climbing.
@@ -1909,8 +1937,8 @@
 		*offset = *offset >> span;
 		for (i = BF64_GET(*offset, 0, epbs);
 		    i >= 0 && i < epb; i += inc) {
-			if (bp[i].blk_fill >= minfill &&
-			    bp[i].blk_fill <= maxfill &&
+			if (BP_GET_FILL(&bp[i]) >= minfill &&
+			    BP_GET_FILL(&bp[i]) <= maxfill &&
 			    (hole || bp[i].blk_birth > txg))
 				break;
 			if (inc > 0 || *offset > 0)
@@ -1997,6 +2025,15 @@
 		    flags, offset, lvl, blkfill, txg);
 	}
 
+	/*
+	 * There's always a "virtual hole" at the end of the object, even
+	 * if all BP's which physically exist are non-holes.
+	 */
+	if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
+	    minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
+		error = 0;
+	}
+
 	if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
 	    initial_offset < *offset : initial_offset > *offset))
 		error = SET_ERROR(ESRCH);
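
The "virtual hole" hunk above gives dnode_next_offset() the same contract that lseek(SEEK_HOLE) exposes to userland: every object ends in a hole, even when all on-disk BPs are allocated. A minimal demonstration of that contract, assuming a platform that defines SEEK_HOLE:

    #include <stdio.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <fcntl.h>

    int
    main(int argc, char **argv)
    {
        if (argc < 2)
            return (1);
        int fd = open(argv[1], O_RDONLY);
        if (fd == -1)
            return (1);
        /* For a fully dense file this reports the EOF offset. */
        off_t hole = lseek(fd, 0, SEEK_HOLE);
        printf("first hole at %jd\n", (intmax_t)hole);
        close(fd);
        return (0);
    }
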

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,7 +22,8 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -33,6 +34,8 @@
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/spa.h>
+#include <sys/range_tree.h>
+#include <sys/zfeature.h>
 
 static void
 dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
@@ -58,24 +61,19 @@
 	dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
 	    dn->dn_object, dn->dn_phys->dn_nlevels);
 
-	/* check for existing blkptrs in the dnode */
-	for (i = 0; i < nblkptr; i++)
-		if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
-			break;
-	if (i != nblkptr) {
-		/* transfer dnode's block pointers to new indirect block */
-		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
-		ASSERT(db->db.db_data);
-		ASSERT(arc_released(db->db_buf));
-		ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
-		bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
-		    sizeof (blkptr_t) * nblkptr);
-		arc_buf_freeze(db->db_buf);
-	}
+	/* transfer dnode's block pointers to new indirect block */
+	(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
+	ASSERT(db->db.db_data);
+	ASSERT(arc_released(db->db_buf));
+	ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
+	bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+	    sizeof (blkptr_t) * nblkptr);
+	arc_buf_freeze(db->db_buf);
 
 	/* set dbuf's parent pointers to new indirect buf */
 	for (i = 0; i < nblkptr; i++) {
-		dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);
+		dmu_buf_impl_t *child =
+		    dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
 
 		if (child == NULL)
 			continue;
@@ -113,26 +111,44 @@
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
-static int
+static void
 free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 	uint64_t bytesfreed = 0;
-	int i, blocks_freed = 0;
 
 	dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
 
-	for (i = 0; i < num; i++, bp++) {
+	for (int i = 0; i < num; i++, bp++) {
 		if (BP_IS_HOLE(bp))
 			continue;
 
 		bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
 		ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
+
+		/*
+		 * Save some useful information on the holes being
+		 * punched, including logical size, type, and indirection
+		 * level. Retaining birth time enables detection of when
+		 * holes are punched for reducing the number of free
+		 * records transmitted during a zfs send.
+		 */
+
+		uint64_t lsize = BP_GET_LSIZE(bp);
+		dmu_object_type_t type = BP_GET_TYPE(bp);
+		uint64_t lvl = BP_GET_LEVEL(bp);
+
 		bzero(bp, sizeof (blkptr_t));
-		blocks_freed += 1;
+
+		if (spa_feature_is_active(dn->dn_objset->os_spa,
+		    SPA_FEATURE_HOLE_BIRTH)) {
+			BP_SET_LSIZE(bp, lsize);
+			BP_SET_TYPE(bp, type);
+			BP_SET_LEVEL(bp, lvl);
+			BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
+		}
 	}
 	dnode_diduse_space(dn, -bytesfreed);
-	return (blocks_freed);
 }
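
free_blocks() now implements hole birth: the BP is zeroed, then its logical size, type, level, and a birth txg are written back when SPA_FEATURE_HOLE_BIRTH is active, so an incremental zfs send can skip holes that predate the source snapshot. A self-contained sketch of the save/zero/restore sequence (a plain struct stands in for blkptr_t and its BP_SET_* macros):

    #include <string.h>
    #include <stdint.h>

    typedef struct {
        uint64_t lsize, type, level, birth;
    } bp_sketch_t;

    static void
    punch_hole(bp_sketch_t *bp, uint64_t txg, int hole_birth_active)
    {
        uint64_t lsize = bp->lsize;
        uint64_t type = bp->type;
        uint64_t level = bp->level;

        memset(bp, 0, sizeof (*bp));    /* the bzero() above */

        if (hole_birth_active) {
            bp->lsize = lsize;    /* BP_SET_LSIZE */
            bp->type = type;      /* BP_SET_TYPE */
            bp->level = level;    /* BP_SET_LEVEL */
            bp->birth = txg;      /* BP_SET_BIRTH */
        }
    }
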
 
 #ifdef ZFS_DEBUG
@@ -167,7 +183,7 @@
 
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		err = dbuf_hold_impl(dn, db->db_level-1,
-		    (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
+		    (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
 		rw_exit(&dn->dn_struct_rwlock);
 		if (err == ENOENT)
 			continue;
@@ -214,10 +230,8 @@
 }
 #endif
 
-#define	ALL -1
-
-static int
-free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
+static void
+free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
@@ -224,22 +238,19 @@
 	blkptr_t *bp;
 	dmu_buf_impl_t *subdb;
 	uint64_t start, end, dbstart, dbend, i;
-	int epbs, shift, err;
-	int all = TRUE;
-	int blocks_freed = 0;
+	int epbs, shift;
 
 	/*
 	 * There is a small possibility that this block will not be cached:
 	 *   1 - if level > 1 and there are no children with level <= 1
-	 *   2 - if we didn't get a dirty hold (because this block had just
-	 *	 finished being written -- and so had no holds), and then this
-	 *	 block got evicted before we got here.
+	 *   2 - if this block was evicted since we read it from
+	 *	 dmu_tx_hold_free().
 	 */
 	if (db->db_state != DB_CACHED)
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 
 	dbuf_release_bp(db);
-	bp = (blkptr_t *)db->db.db_data;
+	bp = db->db.db_data;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
@@ -249,7 +260,6 @@
 	start = blkid >> shift;
 	if (dbstart < start) {
 		bp += start - dbstart;
-		all = FALSE;
 	} else {
 		start = dbstart;
 	}
@@ -257,49 +267,46 @@
 	end = (blkid + nblks - 1) >> shift;
 	if (dbend <= end)
 		end = dbend;
-	else if (all)
-		all = trunc;
+
 	ASSERT3U(start, <=, end);
 
 	if (db->db_level == 1) {
 		FREE_VERIFY(db, start, end, tx);
-		blocks_freed = free_blocks(dn, bp, end-start+1, tx);
-		arc_buf_freeze(db->db_buf);
-		ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
-		DB_DNODE_EXIT(db);
-		return (all ? ALL : blocks_freed);
+		free_blocks(dn, bp, end-start+1, tx);
+	} else {
+		for (i = start; i <= end; i++, bp++) {
+			if (BP_IS_HOLE(bp))
+				continue;
+			rw_enter(&dn->dn_struct_rwlock, RW_READER);
+			VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
+			    i, TRUE, FALSE, FTAG, &subdb));
+			rw_exit(&dn->dn_struct_rwlock);
+			ASSERT3P(bp, ==, subdb->db_blkptr);
+
+			free_children(subdb, blkid, nblks, tx);
+			dbuf_rele(subdb, FTAG);
+		}
 	}
 
-	for (i = start; i <= end; i++, bp++) {
-		if (BP_IS_HOLE(bp))
-			continue;
-		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb);
-		ASSERT0(err);
-		rw_exit(&dn->dn_struct_rwlock);
+	/* If this whole block is free, free ourself too. */
+	for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
+		if (!BP_IS_HOLE(bp))
+			break;
+	}
+	if (i == 1 << epbs) {
+		/* didn't find any non-holes */
+		bzero(db->db.db_data, db->db.db_size);
+		free_blocks(dn, db->db_blkptr, 1, tx);
+	} else {
+		/*
+		 * Partial block free; must be marked dirty so that it
+		 * will be written out.
+		 */
+		ASSERT(db->db_dirtycnt > 0);
+	}
 
-		if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) {
-			ASSERT3P(subdb->db_blkptr, ==, bp);
-			blocks_freed += free_blocks(dn, bp, 1, tx);
-		} else {
-			all = FALSE;
-		}
-		dbuf_rele(subdb, FTAG);
-	}
 	DB_DNODE_EXIT(db);
 	arc_buf_freeze(db->db_buf);
-#ifdef ZFS_DEBUG
-	bp -= (end-start)+1;
-	for (i = start; i <= end; i++, bp++) {
-		if (i == start && blkid != 0)
-			continue;
-		else if (i == end && !trunc)
-			continue;
-		ASSERT0(bp->blk_birth);
-	}
-#endif
-	ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
-	return (all ? ALL : blocks_freed);
 }
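
free_children() now ends by scanning its own 1 << epbs block pointers; if every one is a hole, the indirect block frees itself, otherwise it stays dirty and will be rewritten. The scan, reduced to a compilable sketch (a zero birth value marks a hole here):

    #include <stdint.h>

    /* Returns 1 when the caller may free the whole indirect block. */
    static int
    all_holes(const uint64_t *bp_birth, int epbs)
    {
        for (int i = 0; i < (1 << epbs); i++) {
            if (bp_birth[i] != 0)
                return (0);    /* found a non-hole; stay dirty */
        }
        return (1);            /* bzero the block and free it */
    }
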
 
 /*
@@ -307,20 +314,21 @@
  * and "free" all the blocks contained there.
  */
 static void
-dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
+    dmu_tx_t *tx)
 {
 	blkptr_t *bp = dn->dn_phys->dn_blkptr;
-	dmu_buf_impl_t *db;
-	int trunc, start, end, shift, i, err;
 	int dnlevel = dn->dn_phys->dn_nlevels;
+	boolean_t trunc = B_FALSE;
 
 	if (blkid > dn->dn_phys->dn_maxblkid)
 		return;
 
 	ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
-	trunc = blkid + nblks > dn->dn_phys->dn_maxblkid;
-	if (trunc)
+	if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
 		nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
+		trunc = B_TRUE;
+	}
 
 	/* There are no indirect blocks in the object */
 	if (dnlevel == 1) {
@@ -329,41 +337,34 @@
 			return;
 		}
 		ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
-		(void) free_blocks(dn, bp + blkid, nblks, tx);
-		if (trunc) {
-			uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
-			    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
-			dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
-			ASSERT(off < dn->dn_phys->dn_maxblkid ||
-			    dn->dn_phys->dn_maxblkid == 0 ||
-			    dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
-		}
-		return;
-	}
+		free_blocks(dn, bp + blkid, nblks, tx);
+	} else {
+		int shift = (dnlevel - 1) *
+		    (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
+		int start = blkid >> shift;
+		int end = (blkid + nblks - 1) >> shift;
+		dmu_buf_impl_t *db;
 
-	shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
-	start = blkid >> shift;
-	ASSERT(start < dn->dn_phys->dn_nblkptr);
-	end = (blkid + nblks - 1) >> shift;
-	bp += start;
-	for (i = start; i <= end; i++, bp++) {
-		if (BP_IS_HOLE(bp))
-			continue;
-		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db);
-		ASSERT0(err);
-		rw_exit(&dn->dn_struct_rwlock);
+		ASSERT(start < dn->dn_phys->dn_nblkptr);
+		bp += start;
+		for (int i = start; i <= end; i++, bp++) {
+			if (BP_IS_HOLE(bp))
+				continue;
+			rw_enter(&dn->dn_struct_rwlock, RW_READER);
+			VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
+			    TRUE, FALSE, FTAG, &db));
+			rw_exit(&dn->dn_struct_rwlock);
 
-		if (free_children(db, blkid, nblks, trunc, tx) == ALL) {
-			ASSERT3P(db->db_blkptr, ==, bp);
-			(void) free_blocks(dn, bp, 1, tx);
+			free_children(db, blkid, nblks, tx);
+			dbuf_rele(db, FTAG);
 		}
-		dbuf_rele(db, FTAG);
 	}
+
 	if (trunc) {
+		dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
+
 		uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
 		    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
-		dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
 		ASSERT(off < dn->dn_phys->dn_maxblkid ||
 		    dn->dn_phys->dn_maxblkid == 0 ||
 		    dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
@@ -370,6 +371,22 @@
 	}
 }
 
+typedef struct dnode_sync_free_range_arg {
+	dnode_t *dsfra_dnode;
+	dmu_tx_t *dsfra_tx;
+} dnode_sync_free_range_arg_t;
+
+static void
+dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
+{
+	dnode_sync_free_range_arg_t *dsfra = arg;
+	dnode_t *dn = dsfra->dsfra_dnode;
+
+	mutex_exit(&dn->dn_mtx);
+	dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
+	mutex_enter(&dn->dn_mtx);
+}
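
dnode_sync_free_range() is now a range_tree_vacate() callback: the tree is walked with dn_mtx held, so the callback drops the lock around the actual block freeing and retakes it before returning. A generic sketch of that callback shape (hypothetical names):

    #include <pthread.h>
    #include <stdint.h>

    static pthread_mutex_t dn_mtx = PTHREAD_MUTEX_INITIALIZER;

    typedef struct {
        void *dsfra_dnode;    /* context the callback needs */
        void *dsfra_tx;
    } free_range_arg_sketch_t;

    static void
    sync_free_range_cb(void *arg, uint64_t blkid, uint64_t nblks)
    {
        free_range_arg_sketch_t *a = arg;
        (void)a;

        /* Walked with dn_mtx held; drop it for the slow part. */
        pthread_mutex_unlock(&dn_mtx);
        /* ... free blocks [blkid, blkid + nblks), may block on I/O ... */
        pthread_mutex_lock(&dn_mtx);
    }
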
+
 /*
  * Try to kick all the dnode's dbufs out of the cache...
  */
@@ -376,59 +393,54 @@
 void
 dnode_evict_dbufs(dnode_t *dn)
 {
-	int progress;
-	int pass = 0;
+	dmu_buf_impl_t db_marker;
+	dmu_buf_impl_t *db, *db_next;
 
-	do {
-		dmu_buf_impl_t *db, marker;
-		int evicting = FALSE;
+	mutex_enter(&dn->dn_dbufs_mtx);
+	for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
 
-		progress = FALSE;
-		mutex_enter(&dn->dn_dbufs_mtx);
-		list_insert_tail(&dn->dn_dbufs, &marker);
-		db = list_head(&dn->dn_dbufs);
-		for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
-			list_remove(&dn->dn_dbufs, db);
-			list_insert_tail(&dn->dn_dbufs, db);
 #ifdef	DEBUG
-			DB_DNODE_ENTER(db);
-			ASSERT3P(DB_DNODE(db), ==, dn);
-			DB_DNODE_EXIT(db);
+		DB_DNODE_ENTER(db);
+		ASSERT3P(DB_DNODE(db), ==, dn);
+		DB_DNODE_EXIT(db);
 #endif	/* DEBUG */
 
-			mutex_enter(&db->db_mtx);
-			if (db->db_state == DB_EVICTING) {
-				progress = TRUE;
-				evicting = TRUE;
-				mutex_exit(&db->db_mtx);
-			} else if (refcount_is_zero(&db->db_holds)) {
-				progress = TRUE;
-				dbuf_clear(db); /* exits db_mtx for us */
-			} else {
-				mutex_exit(&db->db_mtx);
-			}
+		mutex_enter(&db->db_mtx);
+		if (db->db_state != DB_EVICTING &&
+		    refcount_is_zero(&db->db_holds)) {
+			db_marker.db_level = db->db_level;
+			db_marker.db_blkid = db->db_blkid;
+			db_marker.db_state = DB_SEARCH;
+			avl_insert_here(&dn->dn_dbufs, &db_marker, db,
+			    AVL_BEFORE);
 
+			dbuf_destroy(db);
+
+			db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
+			avl_remove(&dn->dn_dbufs, &db_marker);
+		} else {
+			db->db_pending_evict = TRUE;
+			mutex_exit(&db->db_mtx);
+			db_next = AVL_NEXT(&dn->dn_dbufs, db);
 		}
-		list_remove(&dn->dn_dbufs, &marker);
-		/*
-		 * NB: we need to drop dn_dbufs_mtx between passes so
-		 * that any DB_EVICTING dbufs can make progress.
-		 * Ideally, we would have some cv we could wait on, but
-		 * since we don't, just wait a bit to give the other
-		 * thread a chance to run.
-		 */
-		mutex_exit(&dn->dn_dbufs_mtx);
-		if (evicting)
-			delay(1);
-		pass++;
-		ASSERT(pass < 100); /* sanity check */
-	} while (progress);
+	}
+	mutex_exit(&dn->dn_dbufs_mtx);
 
+	dnode_evict_bonus(dn);
+}
+
+void
+dnode_evict_bonus(dnode_t *dn)
+{
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-	if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
-		mutex_enter(&dn->dn_bonus->db_mtx);
-		dbuf_evict(dn->dn_bonus);
-		dn->dn_bonus = NULL;
+	if (dn->dn_bonus != NULL) {
+		if (refcount_is_zero(&dn->dn_bonus->db_holds)) {
+			mutex_enter(&dn->dn_bonus->db_mtx);
+			dbuf_destroy(dn->dn_bonus);
+			dn->dn_bonus = NULL;
+		} else {
+			dn->dn_bonus->db_pending_evict = TRUE;
+		}
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 }
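
The rewritten dnode_evict_dbufs() above replaces the retry-with-delay loop with a single pass that parks a DB_SEARCH marker (see dbuf_compare() earlier in this diff) before each node it destroys, then resumes iteration from the marker. The same trick on a TAILQ, as a compilable stand-in for the AVL tree:

    #include <sys/queue.h>
    #include <stdlib.h>

    struct node {
        TAILQ_ENTRY(node) link;
        int is_marker;
        int evictable;
    };
    TAILQ_HEAD(nodelist, node);

    static void
    evict_all(struct nodelist *list)
    {
        struct node marker = { .is_marker = 1 };
        struct node *n, *next;

        for (n = TAILQ_FIRST(list); n != NULL; n = next) {
            if (n->evictable) {
                /* Park the marker, destroy n, resume after marker. */
                TAILQ_INSERT_BEFORE(n, &marker, link);
                TAILQ_REMOVE(list, n, link);
                free(n);
                next = TAILQ_NEXT(&marker, link);
                TAILQ_REMOVE(list, &marker, link);
            } else {
                next = TAILQ_NEXT(n, link);
            }
        }
    }
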
@@ -456,8 +468,8 @@
 			    dr->dt.dl.dr_data == db->db_buf);
 			dbuf_unoverride(dr);
 		} else {
+			mutex_destroy(&dr->dt.di.dr_mtx);
 			list_destroy(&dr->dt.di.dr_children);
-			mutex_destroy(&dr->dt.di.dr_mtx);
 		}
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
 		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
@@ -480,8 +492,6 @@
 
 	dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
 	dnode_evict_dbufs(dn);
-	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
-	ASSERT3P(dn->dn_bonus, ==, NULL);
 
 	/*
 	 * XXX - It would be nice to assert this, but we may still
@@ -504,7 +514,7 @@
 
 	ASSERT(dn->dn_free_txg > 0);
 	if (dn->dn_allocated_txg != dn->dn_free_txg)
-		dbuf_will_dirty(dn->dn_dbuf, tx);
+		dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
 	bzero(dn->dn_phys, sizeof (dnode_phys_t));
 
 	mutex_enter(&dn->dn_mtx);
@@ -530,7 +540,6 @@
 void
 dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 {
-	free_range_t *rp;
 	dnode_phys_t *dnp = dn->dn_phys;
 	int txgoff = tx->tx_txg & TXG_MASK;
 	list_t *list = &dn->dn_dirty_records[txgoff];
@@ -572,26 +581,34 @@
 		dnp->dn_bonustype = dn->dn_bonustype;
 		dnp->dn_bonuslen = dn->dn_bonuslen;
 	}
-
 	ASSERT(dnp->dn_nlevels > 1 ||
 	    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+	    BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
 	    BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
 	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+	ASSERT(dnp->dn_nlevels < 2 ||
+	    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+	    BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
 
-	if (dn->dn_next_blksz[txgoff]) {
+	if (dn->dn_next_type[txgoff] != 0) {
+		dnp->dn_type = dn->dn_type;
+		dn->dn_next_type[txgoff] = 0;
+	}
+
+	if (dn->dn_next_blksz[txgoff] != 0) {
 		ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
 		    SPA_MINBLOCKSIZE) == 0);
 		ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
 		    dn->dn_maxblkid == 0 || list_head(list) != NULL ||
-		    avl_last(&dn->dn_ranges[txgoff]) ||
 		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
-		    dnp->dn_datablkszsec);
+		    dnp->dn_datablkszsec ||
+		    range_tree_space(dn->dn_free_ranges[txgoff]) != 0);
 		dnp->dn_datablkszsec =
 		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
 		dn->dn_next_blksz[txgoff] = 0;
 	}
 
-	if (dn->dn_next_bonuslen[txgoff]) {
+	if (dn->dn_next_bonuslen[txgoff] != 0) {
 		if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
 			dnp->dn_bonuslen = 0;
 		else
@@ -600,25 +617,26 @@
 		dn->dn_next_bonuslen[txgoff] = 0;
 	}
 
-	if (dn->dn_next_bonustype[txgoff]) {
+	if (dn->dn_next_bonustype[txgoff] != 0) {
 		ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
 		dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
 		dn->dn_next_bonustype[txgoff] = 0;
 	}
 
+	boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
+	    dn->dn_free_txg <= tx->tx_txg;
+
 	/*
-	 * We will either remove a spill block when a file is being removed
-	 * or we have been asked to remove it.
+	 * Remove the spill block if we have been explicitly asked to
+	 * remove it, or if the object is being removed.
 	 */
-	if (dn->dn_rm_spillblk[txgoff] ||
-	    ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
-	    dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg)) {
-		if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+	if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
+		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
 			kill_spill = B_TRUE;
 		dn->dn_rm_spillblk[txgoff] = 0;
 	}
 
-	if (dn->dn_next_indblkshift[txgoff]) {
+	if (dn->dn_next_indblkshift[txgoff] != 0) {
 		ASSERT(dnp->dn_nlevels == 1);
 		dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
 		dn->dn_next_indblkshift[txgoff] = 0;
@@ -635,7 +653,7 @@
 	mutex_exit(&dn->dn_mtx);
 
 	if (kill_spill) {
-		(void) free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
+		free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
 		mutex_enter(&dn->dn_mtx);
 		dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
 		mutex_exit(&dn->dn_mtx);
@@ -642,20 +660,28 @@
 	}
 
 	/* process all the "freed" ranges in the file */
-	while (rp = avl_last(&dn->dn_ranges[txgoff])) {
-		dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx);
-		/* grab the mutex so we don't race with dnode_block_freed() */
+	if (dn->dn_free_ranges[txgoff] != NULL) {
+		dnode_sync_free_range_arg_t dsfra;
+		dsfra.dsfra_dnode = dn;
+		dsfra.dsfra_tx = tx;
 		mutex_enter(&dn->dn_mtx);
-		avl_remove(&dn->dn_ranges[txgoff], rp);
+		range_tree_vacate(dn->dn_free_ranges[txgoff],
+		    dnode_sync_free_range, &dsfra);
+		range_tree_destroy(dn->dn_free_ranges[txgoff]);
+		dn->dn_free_ranges[txgoff] = NULL;
 		mutex_exit(&dn->dn_mtx);
-		kmem_free(rp, sizeof (free_range_t));
 	}
 
-	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
+	if (freeing_dnode) {
 		dnode_sync_free(dn, tx);
 		return;
 	}
 
+	if (dn->dn_next_nlevels[txgoff]) {
+		dnode_increase_indirection(dn, tx);
+		dn->dn_next_nlevels[txgoff] = 0;
+	}
+
 	if (dn->dn_next_nblkptr[txgoff]) {
 		/* this should only happen on a realloc */
 		ASSERT(dn->dn_allocated_txg == tx->tx_txg);
@@ -680,13 +706,8 @@
 		mutex_exit(&dn->dn_mtx);
 	}
 
-	if (dn->dn_next_nlevels[txgoff]) {
-		dnode_increase_indirection(dn, tx);
-		dn->dn_next_nlevels[txgoff] = 0;
-	}
+	dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
 
-	dbuf_sync_list(list, tx);
-
 	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 		ASSERT3P(list_head(list), ==, NULL);
 		dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,9 +21,13 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Portions Copyright (c) 2011 Martin Matuska <mm at FreeBSD.org>
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 RackTop Systems.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved.
  */
 
 #include <sys/dmu_objset.h>
@@ -33,6 +37,7 @@
 #include <sys/dsl_synctask.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_impl.h>
+#include <sys/dmu_send.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
 #include <sys/zio.h>
@@ -49,7 +54,28 @@
 #include <sys/dsl_deadlist.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_userhold.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dmu_send.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <zfs_fletcher.h>
 
+SYSCTL_DECL(_vfs_zfs);
+
+/*
+ * The SPA supports block sizes up to 16MB.  However, very large blocks
+ * can have an impact on i/o latency (e.g. tying up a spinning disk for
+ * ~300ms), and also potentially on the memory allocator.  Therefore,
+ * we do not allow the recordsize to be set larger than zfs_max_recordsize
+ * (default 1MB).  Larger blocks can be created by changing this tunable,
+ * and pools with larger blocks can always be imported and used, regardless
+ * of this setting.
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
+    &zfs_max_recordsize, 0,
+    "Maximum block size.  Expect dragons when tuning this.");
+
 #define	SWITCH64(x, y) \
 	{ \
 		uint64_t __tmp = (x); \
@@ -59,8 +85,12 @@
 
 #define	DS_REF_MAX	(1ULL << 62)
 
-#define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE
+extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
 
+extern int spa_asize_inflation;
+
+static zil_header_t zero_zil;
+
 /*
  * Figure out how much of this delta should be propagated to the dsl_dir
  * layer.  If there's a refreservation, that space has already been
@@ -69,13 +99,15 @@
 static int64_t
 parent_delta(dsl_dataset_t *ds, int64_t delta)
 {
+	dsl_dataset_phys_t *ds_phys;
 	uint64_t old_bytes, new_bytes;
 
 	if (ds->ds_reserved == 0)
 		return (delta);
 
-	old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
-	new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
+	ds_phys = dsl_dataset_phys(ds);
+	old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
+	new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
 
 	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
 	return (new_bytes - old_bytes);
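
parent_delta() only propagates the part of a space delta that moves ds_unique_bytes across the refreservation floor. A worked example using a hypothetical wrapper with the same arithmetic:

    #include <stdio.h>
    #include <stdint.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    /* Assumes unique + delta does not underflow, as in the real code. */
    static int64_t
    parent_delta_sketch(uint64_t unique, uint64_t reserved, int64_t delta)
    {
        uint64_t old_bytes, new_bytes;

        if (reserved == 0)
            return (delta);
        old_bytes = MAX(unique, reserved);
        new_bytes = MAX(unique + delta, reserved);
        return ((int64_t)(new_bytes - old_bytes));
    }

    int
    main(void)
    {
        /* 10M reserved, 8M unique: a 4M write only charges 2M upward. */
        printf("%lld\n", (long long)parent_delta_sketch(
            8 << 20, 10 << 20, 4 << 20));    /* prints 2097152 */
        return (0);
    }
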
@@ -102,21 +134,30 @@
 		    used, compressed, uncompressed);
 		return;
 	}
+
+	ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-
-	mutex_enter(&ds->ds_dir->dd_lock);
 	mutex_enter(&ds->ds_lock);
 	delta = parent_delta(ds, used);
-	ds->ds_phys->ds_referenced_bytes += used;
-	ds->ds_phys->ds_compressed_bytes += compressed;
-	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
-	ds->ds_phys->ds_unique_bytes += used;
+	dsl_dataset_phys(ds)->ds_referenced_bytes += used;
+	dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
+	dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
+	dsl_dataset_phys(ds)->ds_unique_bytes += used;
+
+	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
+		ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
+		    B_TRUE;
+	}
+
+	spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
+	if (f != SPA_FEATURE_NONE)
+		ds->ds_feature_activation_needed[f] = B_TRUE;
+
 	mutex_exit(&ds->ds_lock);
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
 	    compressed, uncompressed, tx);
 	dsl_dir_transfer_space(ds->ds_dir, used - delta,
-	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
-	mutex_exit(&ds->ds_dir->dd_lock);
+	    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
 }
 
 int
@@ -123,6 +164,10 @@
 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
     boolean_t async)
 {
+	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
+	int compressed = BP_GET_PSIZE(bp);
+	int uncompressed = BP_GET_UCSIZE(bp);
+
 	if (BP_IS_HOLE(bp))
 		return (0);
 
@@ -129,11 +174,6 @@
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(bp->blk_birth <= tx->tx_txg);
 
-	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
-	int compressed = BP_GET_PSIZE(bp);
-	int uncompressed = BP_GET_UCSIZE(bp);
-
-	ASSERT(used > 0);
 	if (ds == NULL) {
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
 		dsl_pool_mos_diduse_space(tx->tx_pool,
@@ -142,27 +182,25 @@
 	}
 	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
 
-	ASSERT(!dsl_dataset_is_snapshot(ds));
+	ASSERT(!ds->ds_is_snapshot);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
-	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
+	if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 		int64_t delta;
 
 		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
 
-		mutex_enter(&ds->ds_dir->dd_lock);
 		mutex_enter(&ds->ds_lock);
-		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
+		ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
 		    !DS_UNIQUE_IS_ACCURATE(ds));
 		delta = parent_delta(ds, -used);
-		ds->ds_phys->ds_unique_bytes -= used;
+		dsl_dataset_phys(ds)->ds_unique_bytes -= used;
 		mutex_exit(&ds->ds_lock);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
 		    delta, -compressed, -uncompressed, tx);
 		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
-		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
-		mutex_exit(&ds->ds_dir->dd_lock);
+		    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
 	} else {
 		dprintf_bp(bp, "putting on dead list: %s", "");
 		if (async) {
@@ -178,15 +216,15 @@
 			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
 		}
 		ASSERT3U(ds->ds_prev->ds_object, ==,
-		    ds->ds_phys->ds_prev_snap_obj);
-		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj);
+		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
 		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
-		if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
+		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
 		    ds->ds_object && bp->blk_birth >
-		    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
+		    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 			mutex_enter(&ds->ds_prev->ds_lock);
-			ds->ds_prev->ds_phys->ds_unique_bytes += used;
+			dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
 			mutex_exit(&ds->ds_prev->ds_lock);
 		}
 		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
@@ -195,12 +233,12 @@
 		}
 	}
 	mutex_enter(&ds->ds_lock);
-	ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
-	ds->ds_phys->ds_referenced_bytes -= used;
-	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
-	ds->ds_phys->ds_compressed_bytes -= compressed;
-	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
-	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
+	dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
+	dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
+	dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
 	mutex_exit(&ds->ds_lock);
 
 	return (used);
@@ -226,7 +264,7 @@
 	if (ds->ds_trysnap_txg >
 	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
 		trysnap = ds->ds_trysnap_txg;
-	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
+	return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap));
 }
 
 boolean_t
@@ -233,7 +271,8 @@
 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
     uint64_t blk_birth)
 {
-	if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
+	if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
+	    (bp != NULL && BP_IS_HOLE(bp)))
 		return (B_FALSE);
 
 	ddt_prefetch(dsl_dataset_get_spa(ds), bp);
@@ -241,14 +280,15 @@
 	return (B_TRUE);
 }
 
-/* ARGSUSED */
 static void
-dsl_dataset_evict(dmu_buf_t *db, void *dsv)
+dsl_dataset_evict(void *dbu)
 {
-	dsl_dataset_t *ds = dsv;
+	dsl_dataset_t *ds = dbu;
 
 	ASSERT(ds->ds_owner == NULL);
 
+	ds->ds_dbuf = NULL;
+
 	unique_remove(ds->ds_fsid_guid);
 
 	if (ds->ds_objset != NULL)
@@ -260,13 +300,14 @@
 	}
 
 	bplist_destroy(&ds->ds_pending_deadlist);
-	if (ds->ds_phys->ds_deadlist_obj != 0)
+	if (ds->ds_deadlist.dl_os != NULL)
 		dsl_deadlist_close(&ds->ds_deadlist);
 	if (ds->ds_dir)
-		dsl_dir_rele(ds->ds_dir, ds);
+		dsl_dir_async_rele(ds->ds_dir, ds);
 
 	ASSERT(!list_link_active(&ds->ds_synced_link));
 
+	list_destroy(&ds->ds_prop_cbs);
 	if (mutex_owned(&ds->ds_lock))
 		mutex_exit(&ds->ds_lock);
 	mutex_destroy(&ds->ds_lock);
@@ -273,7 +314,9 @@
 	if (mutex_owned(&ds->ds_opening_lock))
 		mutex_exit(&ds->ds_opening_lock);
 	mutex_destroy(&ds->ds_opening_lock);
+	mutex_destroy(&ds->ds_sendstream_lock);
 	refcount_destroy(&ds->ds_longholds);
+	rrw_destroy(&ds->ds_bp_rwlock);
 
 	kmem_free(ds, sizeof (dsl_dataset_t));
 }
@@ -289,10 +332,10 @@
 
 	if (ds->ds_snapname[0])
 		return (0);
-	if (ds->ds_phys->ds_next_snap_obj == 0)
+	if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
 		return (0);
 
-	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
+	err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
 	    FTAG, &headdbuf);
 	if (err != 0)
 		return (err);
@@ -307,11 +350,11 @@
 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 	matchtype_t mt;
 	int err;
 
-	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
 		mt = MT_FIRST;
 	else
 		mt = MT_EXACT;
@@ -324,16 +367,17 @@
 }
 
 int
-dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
+dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
+    boolean_t adj_cnt)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 	matchtype_t mt;
 	int err;
 
 	dsl_dir_snap_cmtime_update(ds->ds_dir);
 
-	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
 		mt = MT_FIRST;
 	else
 		mt = MT_EXACT;
@@ -341,9 +385,32 @@
 	err = zap_remove_norm(mos, snapobj, name, mt, tx);
 	if (err == ENOTSUP && mt == MT_FIRST)
 		err = zap_remove(mos, snapobj, name, tx);
+
+	if (err == 0 && adj_cnt)
+		dsl_fs_ss_count_adjust(ds->ds_dir, -1,
+		    DD_FIELD_SNAPSHOT_COUNT, tx);
+
 	return (err);
 }
 
+boolean_t
+dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
+{
+	dmu_buf_t *dbuf = ds->ds_dbuf;
+	boolean_t result = B_FALSE;
+
+	if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
+	    ds->ds_object, DMU_BONUS_BLKID, tag)) {
+
+		if (ds == dmu_buf_get_user(dbuf))
+			result = B_TRUE;
+		else
+			dmu_buf_rele(dbuf, tag);
+	}
+
+	return (result);
+}
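
dsl_dataset_try_add_ref() pins the dataset's dbuf first and only then confirms the dataset is still that dbuf's registered user, dropping the reference on a lost race. A lock-free user-space sketch of the same increment-if-live pattern (illustrative names; not the dmu_buf_try_add_ref() implementation):

    #include <stdatomic.h>
    #include <stdbool.h>

    typedef struct buf_sketch {
        atomic_int refs;
        void *user;
    } buf_sketch_t;

    static bool
    try_add_ref(buf_sketch_t *b, void *expected_user)
    {
        int old = atomic_load(&b->refs);

        /* Only pin a buffer that is still live (refs > 0). */
        do {
            if (old == 0)
                return (false);
        } while (!atomic_compare_exchange_weak(&b->refs, &old, old + 1));

        if (b->user == expected_user)
            return (true);               /* valid hold acquired */
        atomic_fetch_sub(&b->refs, 1);   /* raced with eviction; undo */
        return (false);
    }
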
+
 int
 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
     dsl_dataset_t **dsp)
@@ -362,7 +429,7 @@
 
 	/* Make sure dsobj has the correct object type. */
 	dmu_object_info_from_db(dbuf, &doi);
-	if (doi.doi_type != DMU_OT_DSL_DATASET) {
+	if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
 		dmu_buf_rele(dbuf, tag);
 		return (SET_ERROR(EINVAL));
 	}
@@ -374,27 +441,46 @@
 		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
 		ds->ds_dbuf = dbuf;
 		ds->ds_object = dsobj;
-		ds->ds_phys = dbuf->db_data;
+		ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
 
 		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
+		rrw_init(&ds->ds_bp_rwlock, B_FALSE);
 		refcount_create(&ds->ds_longholds);
 
 		bplist_create(&ds->ds_pending_deadlist);
 		dsl_deadlist_open(&ds->ds_deadlist,
-		    mos, ds->ds_phys->ds_deadlist_obj);
+		    mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
 
 		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
 		    offsetof(dmu_sendarg_t, dsa_link));
 
-		if (err == 0) {
-			err = dsl_dir_hold_obj(dp,
-			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
+		list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
+		    offsetof(dsl_prop_cb_record_t, cbr_ds_node));
+
+		if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+			for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+				if (!(spa_feature_table[f].fi_flags &
+				    ZFEATURE_FLAG_PER_DATASET))
+					continue;
+				err = zap_contains(mos, dsobj,
+				    spa_feature_table[f].fi_guid);
+				if (err == 0) {
+					ds->ds_feature_inuse[f] = B_TRUE;
+				} else {
+					ASSERT3U(err, ==, ENOENT);
+					err = 0;
+				}
+			}
 		}
+
+		err = dsl_dir_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir);
 		if (err != 0) {
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_opening_lock);
+			mutex_destroy(&ds->ds_sendstream_lock);
 			refcount_destroy(&ds->ds_longholds);
 			bplist_destroy(&ds->ds_pending_deadlist);
 			dsl_deadlist_close(&ds->ds_deadlist);
@@ -403,25 +489,34 @@
 			return (err);
 		}
 
-		if (!dsl_dataset_is_snapshot(ds)) {
+		if (!ds->ds_is_snapshot) {
 			ds->ds_snapname[0] = '\0';
-			if (ds->ds_phys->ds_prev_snap_obj != 0) {
+			if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 				err = dsl_dataset_hold_obj(dp,
-				    ds->ds_phys->ds_prev_snap_obj,
+				    dsl_dataset_phys(ds)->ds_prev_snap_obj,
 				    ds, &ds->ds_prev);
 			}
+			if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+				int zaperr = zap_lookup(mos, ds->ds_object,
+				    DS_FIELD_BOOKMARK_NAMES,
+				    sizeof (ds->ds_bookmarks), 1,
+				    &ds->ds_bookmarks);
+				if (zaperr != ENOENT)
+					VERIFY0(zaperr);
+			}
 		} else {
 			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
 				err = dsl_dataset_get_snapname(ds);
-			if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
+			if (err == 0 &&
+			    dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
 				err = zap_count(
 				    ds->ds_dir->dd_pool->dp_meta_objset,
-				    ds->ds_phys->ds_userrefs_obj,
+				    dsl_dataset_phys(ds)->ds_userrefs_obj,
 				    &ds->ds_userrefs);
 			}
 		}
 
-		if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
+		if (err == 0 && !ds->ds_is_snapshot) {
 			err = dsl_prop_get_int_ds(ds,
 			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
 			    &ds->ds_reserved);
@@ -434,8 +529,11 @@
 			ds->ds_reserved = ds->ds_quota = 0;
 		}
 
-		if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds,
-		    &ds->ds_phys, dsl_dataset_evict)) != NULL) {
+		dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict, &ds->ds_dbuf);
+		if (err == 0)
+			winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
+
+		if (err != 0 || winner != NULL) {
 			bplist_destroy(&ds->ds_pending_deadlist);
 			dsl_deadlist_close(&ds->ds_deadlist);
 			if (ds->ds_prev)
@@ -443,6 +541,7 @@
 			dsl_dir_rele(ds->ds_dir, ds);
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_opening_lock);
+			mutex_destroy(&ds->ds_sendstream_lock);
 			refcount_destroy(&ds->ds_longholds);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			if (err != 0) {
@@ -452,12 +551,12 @@
 			ds = winner;
 		} else {
 			ds->ds_fsid_guid =
-			    unique_insert(ds->ds_phys->ds_fsid_guid);
+			    unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
 		}
 	}
 	ASSERT3P(ds->ds_dbuf, ==, dbuf);
-	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
-	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
+	ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
+	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
 	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
 	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
 	*dsp = ds;
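
The hold path now probes, for every feature marked per-dataset, whether the dataset's ZAP carries that feature's GUID, treating ENOENT as "not in use" and anything else as a bug. A standalone sketch with a plain array in place of spa_feature_table and a stub lookup (table contents and names are hypothetical):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    #define FLAG_PER_DATASET        0x1
    #define NFEATURES               3

    struct feature {
            const char      *guid;
            unsigned        flags;
    };

    static const struct feature feature_table[NFEATURES] = {
            { "com.example:large_blocks", FLAG_PER_DATASET },
            { "com.example:pool_only",    0 },
            { "com.example:extra",        FLAG_PER_DATASET },
    };

    /* Stand-in for zap_contains(): 0 if present, ENOENT otherwise. */
    static int
    zap_contains_sketch(const char *guid)
    {
            return (strcmp(guid, "com.example:large_blocks") == 0 ? 0 : ENOENT);
    }

    int
    main(void)
    {
            bool inuse[NFEATURES] = { false };

            for (int f = 0; f < NFEATURES; f++) {
                    if (!(feature_table[f].flags & FLAG_PER_DATASET))
                            continue;       /* pool-wide features are skipped */
                    int err = zap_contains_sketch(feature_table[f].guid);
                    if (err == 0)
                            inuse[f] = true;
                    else if (err != ENOENT)
                            return (1);     /* the real code asserts ENOENT */
                    printf("%s: %s\n", feature_table[f].guid,
                        inuse[f] ? "in use" : "not in use");
            }
            return (0);
    }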
@@ -472,6 +571,7 @@
 	const char *snapname;
 	uint64_t obj;
 	int err = 0;
+	dsl_dataset_t *ds;
 
 	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
 	if (err != 0)
@@ -478,38 +578,39 @@
 		return (err);
 
 	ASSERT(dsl_pool_config_held(dp));
-	obj = dd->dd_phys->dd_head_dataset_obj;
+	obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	if (obj != 0)
-		err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
+		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
 	else
 		err = SET_ERROR(ENOENT);
 
 	/* we may be looking for a snapshot */
 	if (err == 0 && snapname != NULL) {
-		dsl_dataset_t *ds;
+		dsl_dataset_t *snap_ds;
 
 		if (*snapname++ != '@') {
-			dsl_dataset_rele(*dsp, tag);
+			dsl_dataset_rele(ds, tag);
 			dsl_dir_rele(dd, FTAG);
 			return (SET_ERROR(ENOENT));
 		}
 
 		dprintf("looking for snapshot '%s'\n", snapname);
-		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
+		err = dsl_dataset_snap_lookup(ds, snapname, &obj);
 		if (err == 0)
-			err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
-		dsl_dataset_rele(*dsp, tag);
+			err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
+		dsl_dataset_rele(ds, tag);
 
 		if (err == 0) {
-			mutex_enter(&ds->ds_lock);
-			if (ds->ds_snapname[0] == 0)
-				(void) strlcpy(ds->ds_snapname, snapname,
-				    sizeof (ds->ds_snapname));
-			mutex_exit(&ds->ds_lock);
-			*dsp = ds;
+			mutex_enter(&snap_ds->ds_lock);
+			if (snap_ds->ds_snapname[0] == 0)
+				(void) strlcpy(snap_ds->ds_snapname, snapname,
+				    sizeof (snap_ds->ds_snapname));
+			mutex_exit(&snap_ds->ds_lock);
+			ds = snap_ds;
 		}
 	}
-
+	if (err == 0)
+		*dsp = ds;
 	dsl_dir_rele(dd, FTAG);
 	return (err);
 }
@@ -581,7 +682,8 @@
 		dsl_dir_name(ds->ds_dir, name);
 		VERIFY0(dsl_dataset_get_snapname(ds));
 		if (ds->ds_snapname[0]) {
-			(void) strcat(name, "@");
+			VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN),
+			    <, ZFS_MAX_DATASET_NAME_LEN);
 			/*
 			 * We use a "recursive" mutex so that we
 			 * can call dprintf_ds() with ds_lock held.
@@ -588,38 +690,27 @@
 			 */
 			if (!MUTEX_HELD(&ds->ds_lock)) {
 				mutex_enter(&ds->ds_lock);
-				(void) strcat(name, ds->ds_snapname);
+				VERIFY3U(strlcat(name, ds->ds_snapname,
+				    ZFS_MAX_DATASET_NAME_LEN), <,
+				    ZFS_MAX_DATASET_NAME_LEN);
 				mutex_exit(&ds->ds_lock);
 			} else {
-				(void) strcat(name, ds->ds_snapname);
+				VERIFY3U(strlcat(name, ds->ds_snapname,
+				    ZFS_MAX_DATASET_NAME_LEN), <,
+				    ZFS_MAX_DATASET_NAME_LEN);
 			}
 		}
 	}
 }
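
The strcat() calls in dsl_dataset_name() are replaced by strlcat() under VERIFY3U: strlcat() returns the total length it tried to build, so a return value greater than or equal to the buffer size flags truncation instead of silently overflowing. A small sketch of the same check (assumes a libc providing strlcat, e.g. FreeBSD's; the length constant is a local stand-in for ZFS_MAX_DATASET_NAME_LEN):

    #include <assert.h>
    #include <stdio.h>
    #include <string.h>

    #define NAME_MAX_LEN    256     /* stand-in for ZFS_MAX_DATASET_NAME_LEN */

    int
    main(void)
    {
            char name[NAME_MAX_LEN] = "tank/home/user";

            /*
             * strlcat() reports the length it attempted to create;
             * anything >= sizeof (name) means the result would have been
             * truncated, which the diff turns into a VERIFY3U failure.
             */
            assert(strlcat(name, "@", sizeof (name)) < sizeof (name));
            assert(strlcat(name, "mysnap", sizeof (name)) < sizeof (name));
            printf("%s\n", name);   /* tank/home/user@mysnap */
            return (0);
    }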
 
-static int
+int
 dsl_dataset_namelen(dsl_dataset_t *ds)
 {
-	int result;
-
-	if (ds == NULL) {
-		result = 3;	/* "mos" */
-	} else {
-		result = dsl_dir_namelen(ds->ds_dir);
-		VERIFY0(dsl_dataset_get_snapname(ds));
-		if (ds->ds_snapname[0]) {
-			++result;	/* adding one for the @-sign */
-			if (!MUTEX_HELD(&ds->ds_lock)) {
-				mutex_enter(&ds->ds_lock);
-				result += strlen(ds->ds_snapname);
-				mutex_exit(&ds->ds_lock);
-			} else {
-				result += strlen(ds->ds_snapname);
-			}
-		}
-	}
-
-	return (result);
+	VERIFY0(dsl_dataset_get_snapname(ds));
+	mutex_enter(&ds->ds_lock);
+	int len = dsl_dir_namelen(ds->ds_dir) + 1 + strlen(ds->ds_snapname);
+	mutex_exit(&ds->ds_lock);
+	return (len);
 }
 
 void
@@ -631,16 +722,14 @@
 void
 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
 {
-	ASSERT(ds->ds_owner == tag && ds->ds_dbuf != NULL);
+	ASSERT3P(ds->ds_owner, ==, tag);
+	ASSERT(ds->ds_dbuf != NULL);
 
 	mutex_enter(&ds->ds_lock);
 	ds->ds_owner = NULL;
 	mutex_exit(&ds->ds_lock);
 	dsl_dataset_long_rele(ds, tag);
-	if (ds->ds_dbuf != NULL)
-		dsl_dataset_rele(ds, tag);
-	else
-		dsl_dataset_evict(NULL, ds);
+	dsl_dataset_rele(ds, tag);
 }
 
 boolean_t
@@ -648,6 +737,7 @@
 {
 	boolean_t gotit = FALSE;
 
+	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 	mutex_enter(&ds->ds_lock);
 	if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
 		ds->ds_owner = tag;
@@ -658,6 +748,44 @@
 	return (gotit);
 }
 
+boolean_t
+dsl_dataset_has_owner(dsl_dataset_t *ds)
+{
+	boolean_t rv;
+	mutex_enter(&ds->ds_lock);
+	rv = (ds->ds_owner != NULL);
+	mutex_exit(&ds->ds_lock);
+	return (rv);
+}
+
+static void
+dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+	uint64_t zero = 0;
+
+	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+	spa_feature_incr(spa, f, tx);
+	dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+
+	VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
+	    sizeof (zero), 1, &zero, tx));
+}
+
+void
+dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+
+	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+	VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
+	spa_feature_decr(spa, f, tx);
+}
+
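dsl_dataset_activate_feature() and its inverse keep the per-dataset ZAP entry and the pool-wide feature refcount in lock-step: activation bumps the count, zapifies the dataset object, and adds the GUID; deactivation removes the GUID and then drops the count. A toy sketch of that pairing, with plain globals as hypothetical stand-ins for the spa counter and the ZAP entry:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    static unsigned pool_feature_refcount;  /* stand-in for the spa count */
    static bool ds_has_entry;               /* stand-in for the ZAP entry */

    static void
    activate_feature(void)
    {
            pool_feature_refcount++;        /* spa_feature_incr() */
            assert(!ds_has_entry);
            ds_has_entry = true;            /* zap_add() of the feature GUID */
    }

    static void
    deactivate_feature(void)
    {
            assert(ds_has_entry);
            ds_has_entry = false;           /* zap_remove() first ... */
            pool_feature_refcount--;        /* ... then spa_feature_decr() */
    }

    int
    main(void)
    {
            activate_feature();
            deactivate_feature();
            assert(pool_feature_refcount == 0);
            printf("refcount balanced\n");
            return (0);
    }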
 uint64_t
 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
     uint64_t flags, dmu_tx_t *tx)
@@ -672,9 +800,9 @@
 		origin = dp->dp_origin_snap;
 
 	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
-	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
+	ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
 	ASSERT(dmu_tx_is_syncing(tx));
-	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
+	ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
 
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
@@ -702,46 +830,62 @@
 
 		dsphys->ds_prev_snap_obj = origin->ds_object;
 		dsphys->ds_prev_snap_txg =
-		    origin->ds_phys->ds_creation_txg;
+		    dsl_dataset_phys(origin)->ds_creation_txg;
 		dsphys->ds_referenced_bytes =
-		    origin->ds_phys->ds_referenced_bytes;
+		    dsl_dataset_phys(origin)->ds_referenced_bytes;
 		dsphys->ds_compressed_bytes =
-		    origin->ds_phys->ds_compressed_bytes;
+		    dsl_dataset_phys(origin)->ds_compressed_bytes;
 		dsphys->ds_uncompressed_bytes =
-		    origin->ds_phys->ds_uncompressed_bytes;
-		dsphys->ds_bp = origin->ds_phys->ds_bp;
-		dsphys->ds_flags |= origin->ds_phys->ds_flags;
+		    dsl_dataset_phys(origin)->ds_uncompressed_bytes;
+		rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG);
+		dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
+		rrw_exit(&origin->ds_bp_rwlock, FTAG);
 
+		/*
+		 * Inherit flags that describe the dataset's contents
+		 * (INCONSISTENT) or properties (Case Insensitive).
+		 */
+		dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
+		    (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
+
+		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+			if (origin->ds_feature_inuse[f])
+				dsl_dataset_activate_feature(dsobj, f, tx);
+		}
+
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
-		origin->ds_phys->ds_num_children++;
+		dsl_dataset_phys(origin)->ds_num_children++;
 
 		VERIFY0(dsl_dataset_hold_obj(dp,
-		    origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
+		    dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
+		    FTAG, &ohds));
 		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
 		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
 		dsl_dataset_rele(ohds, FTAG);
 
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
-			if (origin->ds_phys->ds_next_clones_obj == 0) {
-				origin->ds_phys->ds_next_clones_obj =
+			if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
+				dsl_dataset_phys(origin)->ds_next_clones_obj =
 				    zap_create(mos,
 				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 			}
 			VERIFY0(zap_add_int(mos,
-			    origin->ds_phys->ds_next_clones_obj, dsobj, tx));
+			    dsl_dataset_phys(origin)->ds_next_clones_obj,
+			    dsobj, tx));
 		}
 
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
-		dd->dd_phys->dd_origin_obj = origin->ds_object;
+		dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
-			if (origin->ds_dir->dd_phys->dd_clones == 0) {
+			if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
 				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
-				origin->ds_dir->dd_phys->dd_clones =
+				dsl_dir_phys(origin->ds_dir)->dd_clones =
 				    zap_create(mos,
 				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 			}
 			VERIFY0(zap_add_int(mos,
-			    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+			    dsl_dir_phys(origin->ds_dir)->dd_clones,
+			    dsobj, tx));
 		}
 	}
 
@@ -751,7 +895,7 @@
 	dmu_buf_rele(dbuf, FTAG);
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
-	dd->dd_phys->dd_head_dataset_obj = dsobj;
+	dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
 
 	return (dsobj);
 }
@@ -762,8 +906,20 @@
 	objset_t *os;
 
 	VERIFY0(dmu_objset_from_ds(ds, &os));
-	bzero(&os->os_zil_header, sizeof (os->os_zil_header));
-	dsl_dataset_dirty(ds, tx);
+	if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) {
+		dsl_pool_t *dp = ds->ds_dir->dd_pool;
+		zio_t *zio;
+
+		bzero(&os->os_zil_header, sizeof (os->os_zil_header));
+
+		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+		dsl_dataset_sync(ds, zio, tx);
+		VERIFY0(zio_wait(zio));
+
+		/* dsl_dataset_sync_done will drop this reference. */
+		dmu_buf_add_ref(ds->ds_dbuf, ds);
+		dsl_dataset_sync_done(ds, tx);
+	}
 }
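
The reworked dsl_dataset_zero_zil() compares the ZIL header against an all-zero template before doing anything, so a dataset whose header is already clean is neither dirtied nor force-synced. The compare-before-write idiom in isolation (the struct fields here are invented for the sketch):

    #include <stdio.h>
    #include <string.h>

    struct zil_header_sketch {
            unsigned long claim_txg;
            unsigned long replay_seq;
    };

    static const struct zil_header_sketch zero_hdr; /* all-zero template */

    /* Returns 1 if work (dirtying + sync) would have been required. */
    static int
    zero_zil_sketch(struct zil_header_sketch *hdr)
    {
            if (memcmp(hdr, &zero_hdr, sizeof (*hdr)) == 0)
                    return (0);     /* already zero: skip the expensive path */
            memset(hdr, 0, sizeof (*hdr));
            return (1);
    }

    int
    main(void)
    {
            struct zil_header_sketch h = { 42, 7 };

            printf("%d\n", zero_zil_sketch(&h));    /* 1: had to zero it */
            printf("%d\n", zero_zil_sketch(&h));    /* 0: nothing to do */
            return (0);
    }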
 
 uint64_t
@@ -785,6 +941,21 @@
 
 	dsl_deleg_set_create_perms(dd, tx, cr);
 
+	/*
+	 * Since we're creating a new node we know it's a leaf, so we can
+	 * initialize the counts if the limit feature is active.
+	 */
+	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+		uint64_t cnt = 0;
+		objset_t *os = dd->dd_pool->dp_meta_objset;
+
+		dsl_dir_zapify(dd, tx);
+		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+		    sizeof (cnt), 1, &cnt, tx));
+		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+		    sizeof (cnt), 1, &cnt, tx));
+	}
+
 	dsl_dir_rele(dd, FTAG);
 
 	/*
@@ -856,10 +1027,10 @@
 	uint64_t mrs_used;
 	uint64_t dlused, dlcomp, dluncomp;
 
-	ASSERT(!dsl_dataset_is_snapshot(ds));
+	ASSERT(!ds->ds_is_snapshot);
 
-	if (ds->ds_phys->ds_prev_snap_obj != 0)
-		mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
+		mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
 	else
 		mrs_used = 0;
 
@@ -866,12 +1037,12 @@
 	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
 
 	ASSERT3U(dlused, <=, mrs_used);
-	ds->ds_phys->ds_unique_bytes =
-	    ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
+	dsl_dataset_phys(ds)->ds_unique_bytes =
+	    dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
 
 	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
 	    SPA_VERSION_UNIQUE_ACCURATE)
-		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 }
 
 void
@@ -882,8 +1053,9 @@
 	uint64_t count;
 	int err;
 
-	ASSERT(ds->ds_phys->ds_num_children >= 2);
-	err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
+	ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
+	err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+	    obj, tx);
 	/*
 	 * The err should not be ENOENT, but a bug in a previous version
 	 * of the code could cause upgrade_clones_cb() to not set
@@ -896,9 +1068,9 @@
 	 */
 	if (err != ENOENT)
 		VERIFY0(err);
-	ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
+	ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
 	    &count));
-	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
+	ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
 }
 
 
@@ -905,22 +1077,9 @@
 blkptr_t *
 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
 {
-	return (&ds->ds_phys->ds_bp);
+	return (&dsl_dataset_phys(ds)->ds_bp);
 }
 
-void
-dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
-{
-	ASSERT(dmu_tx_is_syncing(tx));
-	/* If it's the meta-objset, set dp_meta_rootbp */
-	if (ds == NULL) {
-		tx->tx_pool->dp_meta_rootbp = *bp;
-	} else {
-		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ds->ds_phys->ds_bp = *bp;
-	}
-}
-
 spa_t *
 dsl_dataset_get_spa(dsl_dataset_t *ds)
 {
@@ -937,11 +1096,13 @@
 
 	ASSERT(ds->ds_objset != NULL);
 
-	if (ds->ds_phys->ds_next_snap_obj != 0)
+	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
 		panic("dirtying snapshot!");
 
+	/* Must not dirty a dataset in the same txg where it got snapshotted. */
+	ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
+
 	dp = ds->ds_dir->dd_pool;
-
 	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
 		/* up the hold count until we can be written out */
 		dmu_buf_add_ref(ds->ds_dbuf, ds);
@@ -973,7 +1134,7 @@
 	 * outside of the reservation.
 	 */
 	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
-	asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
+	asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
 	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
 		return (SET_ERROR(ENOSPC));
 
@@ -991,11 +1152,12 @@
 	nvlist_t *ddsa_snaps;
 	nvlist_t *ddsa_props;
 	nvlist_t *ddsa_errors;
+	cred_t *ddsa_cr;
 } dsl_dataset_snapshot_arg_t;
 
 int
 dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
-    dmu_tx_t *tx)
+    dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
 {
 	int error;
 	uint64_t value;
@@ -1009,7 +1171,7 @@
 	 * We don't allow multiple snapshots of the same txg.  If there
 	 * is already one, try again.
 	 */
-	if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
+	if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
 		return (SET_ERROR(EAGAIN));
 
 	/*
@@ -1021,6 +1183,30 @@
 	if (error != ENOENT)
 		return (error);
 
+	/*
+	 * We don't allow taking snapshots of inconsistent datasets, such as
+	 * those into which we are currently receiving.  However, if we are
+	 * creating this snapshot as part of a receive, this check will be
+	 * executed atomically with respect to the completion of the receive
+	 * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
+	 * case we ignore this, knowing it will be fixed up for us shortly in
+	 * dmu_recv_end_sync().
+	 */
+	if (!recv && DS_IS_INCONSISTENT(ds))
+		return (SET_ERROR(EBUSY));
+
+	/*
+	 * Skip the check for temporary snapshots or if we have already checked
+	 * the counts in dsl_dataset_snapshot_check. This means we really only
+	 * check the count here when we're receiving a stream.
+	 */
+	if (cnt != 0 && cr != NULL) {
+		error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
+		if (error != 0)
+			return (error);
+	}
+
 	error = dsl_dataset_snapshot_reserve_space(ds, tx);
 	if (error != 0)
 		return (error);
@@ -1036,15 +1222,108 @@
 	nvpair_t *pair;
 	int rv = 0;
 
+	/*
+	 * Pre-compute how many total new snapshots will be created for each
+	 * level in the tree and below. This is needed for validating the
+	 * snapshot limit when either taking a recursive snapshot or when
+	 * taking multiple snapshots.
+	 *
+	 * The problem is that the counts are not actually adjusted when
+	 * we are checking, only when we finally sync. For a single snapshot,
+	 * this is easy, the count will increase by 1 at each node up the tree,
+	 * but it's more complicated for the recursive/multiple snapshot case.
+	 *
+	 * The dsl_fs_ss_limit_check function does recursively check the count
+	 * at each level up the tree but since it is validating each snapshot
+	 * independently we need to be sure that we are validating the complete
+	 * count for the entire set of snapshots. We do this by rolling up the
+	 * counts for each component of the name into an nvlist and then
+	 * checking each of those cases with the aggregated count.
+	 *
+	 * This approach properly handles not only the recursive snapshot
+	 * case (where we get all of those on the ddsa_snaps list) but also
+	 * the sibling case (e.g. snapshot a/b and a/c so that we will also
+	 * validate the limit on 'a' using a count of 2).
+	 *
+	 * We validate the snapshot names in the third loop and only report
+	 * name errors once.
+	 */
+	if (dmu_tx_is_syncing(tx)) {
+		nvlist_t *cnt_track = NULL;
+		cnt_track = fnvlist_alloc();
+
+		/* Rollup aggregated counts into the cnt_track list */
+		for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+		    pair != NULL;
+		    pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+			char *pdelim;
+			uint64_t val;
+			char nm[MAXPATHLEN];
+
+			(void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
+			pdelim = strchr(nm, '@');
+			if (pdelim == NULL)
+				continue;
+			*pdelim = '\0';
+
+			do {
+				if (nvlist_lookup_uint64(cnt_track, nm,
+				    &val) == 0) {
+					/* update existing entry */
+					fnvlist_add_uint64(cnt_track, nm,
+					    val + 1);
+				} else {
+					/* add to list */
+					fnvlist_add_uint64(cnt_track, nm, 1);
+				}
+
+				pdelim = strrchr(nm, '/');
+				if (pdelim != NULL)
+					*pdelim = '\0';
+			} while (pdelim != NULL);
+		}
+
+		/* Check aggregated counts at each level */
+		for (pair = nvlist_next_nvpair(cnt_track, NULL);
+		    pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
+			int error = 0;
+			char *name;
+			uint64_t cnt = 0;
+			dsl_dataset_t *ds;
+
+			name = nvpair_name(pair);
+			cnt = fnvpair_value_uint64(pair);
+			ASSERT(cnt > 0);
+
+			error = dsl_dataset_hold(dp, name, FTAG, &ds);
+			if (error == 0) {
+				error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+				    ZFS_PROP_SNAPSHOT_LIMIT, NULL,
+				    ddsa->ddsa_cr);
+				dsl_dataset_rele(ds, FTAG);
+			}
+
+			if (error != 0) {
+				if (ddsa->ddsa_errors != NULL)
+					fnvlist_add_int32(ddsa->ddsa_errors,
+					    name, error);
+				rv = error;
+				/* only report one error for this check */
+				break;
+			}
+		}
+		nvlist_free(cnt_track);
+	}
+
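The rollup above is worth seeing outside the nvlist plumbing: each requested snapshot name is chopped at '@', and the remaining dataset path is then credited to itself and to every '/'-ancestor, so a single aggregated limit check per node covers recursive and sibling snapshots alike. A standalone sketch using a flat table in place of the cnt_track nvlist (all names are hypothetical):

    #include <stdio.h>
    #include <string.h>

    #define MAXENTRIES      32
    #define MAXNAME         256

    static char keys[MAXENTRIES][MAXNAME];
    static unsigned counts[MAXENTRIES];
    static int nentries;

    static void
    bump(const char *key)
    {
            for (int i = 0; i < nentries; i++) {
                    if (strcmp(keys[i], key) == 0) {
                            counts[i]++;    /* update existing entry */
                            return;
                    }
            }
            snprintf(keys[nentries], MAXNAME, "%s", key);   /* add to list */
            counts[nentries++] = 1;
    }

    int
    main(void)
    {
            const char *snaps[] = { "a/b@s1", "a/c@s1", "a@s1" };
            char nm[MAXNAME];

            for (int i = 0; i < 3; i++) {
                    char *p;

                    snprintf(nm, sizeof (nm), "%s", snaps[i]);
                    p = strchr(nm, '@');
                    if (p == NULL)
                            continue;
                    *p = '\0';
                    /* Credit the dataset itself and every ancestor. */
                    do {
                            bump(nm);
                            p = strrchr(nm, '/');
                            if (p != NULL)
                                    *p = '\0';
                    } while (p != NULL);
            }
            for (int i = 0; i < nentries; i++)
                    printf("%s: %u\n", keys[i], counts[i]);
            /* Prints a/b: 1, a: 3, a/c: 1 -- 'a' absorbs all three. */
            return (0);
    }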
 	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
 		int error = 0;
 		dsl_dataset_t *ds;
 		char *name, *atp;
-		char dsname[MAXNAMELEN];
+		char dsname[ZFS_MAX_DATASET_NAME_LEN];
 
 		name = nvpair_name(pair);
-		if (strlen(name) >= MAXNAMELEN)
+		if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN)
 			error = SET_ERROR(ENAMETOOLONG);
 		if (error == 0) {
 			atp = strchr(name, '@');
@@ -1056,8 +1335,9 @@
 		if (error == 0)
 			error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
 		if (error == 0) {
+			/* passing 0/NULL skips dsl_fs_ss_limit_check */
 			error = dsl_dataset_snapshot_check_impl(ds,
-			    atp + 1, tx);
+			    atp + 1, tx, B_FALSE, 0, NULL);
 			dsl_dataset_rele(ds, FTAG);
 		}
 
@@ -1069,6 +1349,7 @@
 			rv = error;
 		}
 	}
+
 	return (rv);
 }
 
@@ -1076,8 +1357,6 @@
 dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
     dmu_tx_t *tx)
 {
-	static zil_header_t zero_zil;
-
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
@@ -1096,7 +1375,12 @@
 	    bcmp(&os->os_phys->os_zil_header, &zero_zil,
 	    sizeof (zero_zil)) == 0);
 
+	/* Should not snapshot a dirty dataset. */
+	ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
+	    ds, tx->tx_txg));
 
+	dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
+
 	/*
 	 * The origin's ds_creation_txg has to be < TXG_INITIAL
 	 */
@@ -1117,32 +1401,42 @@
 		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 		    sizeof (dsphys->ds_guid));
 	} while (dsphys->ds_guid == 0);
-	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
-	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
+	dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+	dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 	dsphys->ds_next_snap_obj = ds->ds_object;
 	dsphys->ds_num_children = 1;
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = crtxg;
-	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
-	dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
-	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
-	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
-	dsphys->ds_flags = ds->ds_phys->ds_flags;
-	dsphys->ds_bp = ds->ds_phys->ds_bp;
+	dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+	dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
+	dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
+	dsphys->ds_uncompressed_bytes =
+	    dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+	dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	dmu_buf_rele(dbuf, FTAG);
 
-	ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
+	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+		if (ds->ds_feature_inuse[f])
+			dsl_dataset_activate_feature(dsobj, f, tx);
+	}
+
+	ASSERT3U(ds->ds_prev != 0, ==,
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
 	if (ds->ds_prev) {
 		uint64_t next_clones_obj =
-		    ds->ds_prev->ds_phys->ds_next_clones_obj;
-		ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
+		    dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
+		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
 		    ds->ds_object ||
-		    ds->ds_prev->ds_phys->ds_num_children > 1);
-		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
+		    dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
+		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+		    ds->ds_object) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
-			    ds->ds_prev->ds_phys->ds_creation_txg);
-			ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
+			ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+			    dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
+			dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
 		} else if (next_clones_obj != 0) {
 			dsl_dataset_remove_from_next_clones(ds->ds_prev,
 			    dsphys->ds_next_snap_obj, tx);
@@ -1159,33 +1453,36 @@
 	if (ds->ds_reserved) {
 		int64_t delta;
 		ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
-		delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
+		delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
+		    ds->ds_reserved);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
 		    delta, 0, 0, tx);
 	}
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
-	    UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
+	dsl_dataset_phys(ds)->ds_deadlist_obj =
+	    dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
 	dsl_deadlist_close(&ds->ds_deadlist);
-	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+	dsl_deadlist_open(&ds->ds_deadlist, mos,
+	    dsl_dataset_phys(ds)->ds_deadlist_obj);
 	dsl_deadlist_add_key(&ds->ds_deadlist,
-	    ds->ds_phys->ds_prev_snap_txg, tx);
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
 
-	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
-	ds->ds_phys->ds_prev_snap_obj = dsobj;
-	ds->ds_phys->ds_prev_snap_txg = crtxg;
-	ds->ds_phys->ds_unique_bytes = 0;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
+	dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
+	dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
+	dsl_dataset_phys(ds)->ds_unique_bytes = 0;
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
-		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 
-	VERIFY0(zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
+	VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
 	    snapname, 8, 1, &dsobj, tx));
 
 	if (ds->ds_prev)
 		dsl_dataset_rele(ds->ds_prev, ds);
 	VERIFY0(dsl_dataset_hold_obj(dp,
-	    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
 
 	dsl_scan_ds_snapshotted(ds, tx);
 
@@ -1205,7 +1502,7 @@
 	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
 		dsl_dataset_t *ds;
 		char *name, *atp;
-		char dsname[MAXNAMELEN];
+		char dsname[ZFS_MAX_DATASET_NAME_LEN];
 
 		name = nvpair_name(pair);
 		atp = strchr(name, '@');
@@ -1251,7 +1548,7 @@
 		suspended = fnvlist_alloc();
 		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(snaps, pair)) {
-			char fsname[MAXNAMELEN];
+			char fsname[ZFS_MAX_DATASET_NAME_LEN];
 			char *snapname = nvpair_name(pair);
 			char *atp;
 			void *cookie;
@@ -1274,11 +1571,12 @@
 	ddsa.ddsa_snaps = snaps;
 	ddsa.ddsa_props = props;
 	ddsa.ddsa_errors = errors;
+	ddsa.ddsa_cr = CRED();
 
 	if (error == 0) {
 		error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
 		    dsl_dataset_snapshot_sync, &ddsa,
-		    fnvlist_num_pairs(snaps) * 3);
+		    fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
 	}
 
 	if (suspended != NULL) {
@@ -1323,7 +1621,9 @@
 	if (error != 0)
 		return (error);
 
-	error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, tx);
+	/* NULL cred means no limit check for tmp snapshot */
+	error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
+	    tx, B_FALSE, 0, NULL);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
@@ -1389,7 +1689,7 @@
 	}
 
 	error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
-	    dsl_dataset_snapshot_tmp_sync, &ddsta, 3);
+	    dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
 
 	if (needsuspend)
 		zil_resume(cookie);
@@ -1402,7 +1702,7 @@
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(ds->ds_objset != NULL);
-	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
+	ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
 
 	/*
 	 * in case we had to change ds_fsid_guid when we opened it,
@@ -1409,11 +1709,56 @@
 	 * sync it out now.
 	 */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
+	dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
 
+	if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
+		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+		    ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
+		    &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
+		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+		    ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
+		    &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
+		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+		    ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
+		    &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
+		ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
+		ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
+		ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
+	}
+
 	dmu_objset_sync(ds->ds_objset, zio, tx);
+
+	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+		if (ds->ds_feature_activation_needed[f]) {
+			if (ds->ds_feature_inuse[f])
+				continue;
+			dsl_dataset_activate_feature(ds->ds_object, f, tx);
+			ds->ds_feature_inuse[f] = B_TRUE;
+		}
+	}
 }
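
The resume bookkeeping added to dsl_dataset_sync() lives in small arrays indexed by tx->tx_txg & TXG_MASK: ZFS keeps TXG_SIZE (4) transaction groups in flight at once, so each txg gets a private slot that the sync path persists and then clears. The indexing scheme in isolation:

    #include <stdio.h>

    #define TXG_SIZE        4               /* as in ZFS */
    #define TXG_MASK        (TXG_SIZE - 1)  /* TXG_SIZE is a power of 2 */

    static unsigned long resume_bytes[TXG_SIZE];

    /* Writer side: record progress against the currently open txg. */
    static void
    note_bytes(unsigned long txg, unsigned long n)
    {
            resume_bytes[txg & TXG_MASK] += n;
    }

    /* Sync side: consume and clear exactly this txg's slot. */
    static unsigned long
    sync_slot(unsigned long txg)
    {
            unsigned long v = resume_bytes[txg & TXG_MASK];

            resume_bytes[txg & TXG_MASK] = 0;       /* persisted, now reset */
            return (v);
    }

    int
    main(void)
    {
            note_bytes(100, 512);
            note_bytes(101, 128);   /* different txg, different slot */
            printf("txg 100 synced %lu bytes\n", sync_slot(100));
            printf("txg 101 synced %lu bytes\n", sync_slot(101));
            return (0);
    }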
 
+static int
+deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_deadlist_t *dl = arg;
+	dsl_deadlist_insert(dl, bp, tx);
+	return (0);
+}
+
+void
+dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	objset_t *os = ds->ds_objset;
+
+	bplist_iterate(&ds->ds_pending_deadlist,
+	    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
+
+	ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));
+
+	dmu_buf_rele(ds->ds_dbuf, ds);
+}
+
 static void
 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
 {
@@ -1422,26 +1767,38 @@
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	nvlist_t *propval = fnvlist_alloc();
-	nvlist_t *val = fnvlist_alloc();
+	nvlist_t *val;
 
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 
 	/*
+	 * We use nvlist_alloc() instead of fnvlist_alloc() because the
+	 * latter would allocate the list with NV_UNIQUE_NAME flag.
+	 * As a result, every time a clone name is appended to the list
+	 * it would be (linearly) searched for a duplicate name.
+	 * We already know that all clone names must be unique and we
+	 * want to avoid the quadratic complexity of double-checking that
+	 * because we can have a large number of clones.
+	 */
+	VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP));
+
+	/*
 	 * There may be missing entries in ds_next_clones_obj
 	 * due to a bug in a previous version of the code.
 	 * Only trust it if it has the right number of entries.
 	 */
-	if (ds->ds_phys->ds_next_clones_obj != 0) {
-		ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
+	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+		VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
 		    &count));
 	}
-	if (count != ds->ds_phys->ds_num_children - 1)
+	if (count != dsl_dataset_phys(ds)->ds_num_children - 1)
 		goto fail;
-	for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
+	for (zap_cursor_init(&zc, mos,
+	    dsl_dataset_phys(ds)->ds_next_clones_obj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		dsl_dataset_t *clone;
-		char buf[ZFS_MAXNAMELEN];
+		char buf[ZFS_MAX_DATASET_NAME_LEN];
 		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
 		    za.za_first_integer, FTAG, &clone));
 		dsl_dir_name(clone->ds_dir, buf);
@@ -1456,6 +1813,76 @@
 	nvlist_free(propval);
 }
 
+static void
+get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	if (dsl_dataset_has_resume_receive_state(ds)) {
+		char *str;
+		void *packed;
+		uint8_t *compressed;
+		uint64_t val;
+		nvlist_t *token_nv = fnvlist_alloc();
+		size_t packed_size, compressed_size;
+
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, "fromguid", val);
+		}
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, "object", val);
+		}
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, "offset", val);
+		}
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, "bytes", val);
+		}
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, "toguid", val);
+		}
+		char buf[256];
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
+			fnvlist_add_string(token_nv, "toname", buf);
+		}
+		if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_EMBEDOK) == 0) {
+			fnvlist_add_boolean(token_nv, "embedok");
+		}
+		packed = fnvlist_pack(token_nv, &packed_size);
+		fnvlist_free(token_nv);
+		compressed = kmem_alloc(packed_size, KM_SLEEP);
+
+		compressed_size = gzip_compress(packed, compressed,
+		    packed_size, packed_size, 6);
+
+		zio_cksum_t cksum;
+		fletcher_4_native(compressed, compressed_size, NULL, &cksum);
+
+		str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP);
+		for (int i = 0; i < compressed_size; i++) {
+			(void) sprintf(str + i * 2, "%02x", compressed[i]);
+		}
+		str[compressed_size * 2] = '\0';
+		char *propval = kmem_asprintf("%u-%llx-%llx-%s",
+		    ZFS_SEND_RESUME_TOKEN_VERSION,
+		    (longlong_t)cksum.zc_word[0],
+		    (longlong_t)packed_size, str);
+		dsl_prop_nvlist_add_string(nv,
+		    ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
+		kmem_free(packed, packed_size);
+		kmem_free(str, compressed_size * 2 + 1);
+		kmem_free(compressed, packed_size);
+		strfree(propval);
+	}
+}
+
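get_receive_resume_stats() builds the receive_resume_token by packing an nvlist, gzip-compressing it, checksumming the result with fletcher-4, hex-encoding the payload, and prefixing the version, one checksum word, and the packed size. A sketch of just the final encode/format step over an arbitrary buffer (the compression output and checksum word are stubbed with fixed values; only the formatting matches the code above):

    #include <stdio.h>
    #include <stdlib.h>

    #define TOKEN_VERSION   1

    int
    main(void)
    {
            /* Pretend this is the gzip-compressed packed nvlist. */
            const unsigned char compressed[] = { 0xde, 0xad, 0xbe, 0xef };
            size_t compressed_size = sizeof (compressed);
            unsigned long long cksum_word0 = 0x1234abcdULL; /* fletcher-4 [0] */
            unsigned long long packed_size = 64;            /* pre-compression */

            char *str = malloc(compressed_size * 2 + 1);
            if (str == NULL)
                    return (1);
            for (size_t i = 0; i < compressed_size; i++)
                    (void) sprintf(str + i * 2, "%02x", compressed[i]);
            str[compressed_size * 2] = '\0';

            /* "<version>-<cksum word 0>-<packed size>-<hex payload>" */
            printf("%u-%llx-%llx-%s\n", TOKEN_VERSION, cksum_word0,
                packed_size, str);
            free(str);
            return (0);
    }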
 void
 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 {
@@ -1464,20 +1891,26 @@
 
 	ASSERT(dsl_pool_config_held(dp));
 
-	ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
-	    (ds->ds_phys->ds_uncompressed_bytes * 100 /
-	    ds->ds_phys->ds_compressed_bytes);
+	ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
+	    (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
+	    dsl_dataset_phys(ds)->ds_compressed_bytes);
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
-	    ds->ds_phys->ds_uncompressed_bytes);
+	    dsl_dataset_phys(ds)->ds_uncompressed_bytes);
 
-	if (dsl_dataset_is_snapshot(ds)) {
+	if (ds->ds_is_snapshot) {
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
-		    ds->ds_phys->ds_unique_bytes);
+		    dsl_dataset_phys(ds)->ds_unique_bytes);
 		get_clones_stat(ds, nv);
 	} else {
+		if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
+			char buf[ZFS_MAX_DATASET_NAME_LEN];
+			dsl_dataset_name(ds->ds_prev, buf);
+			dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf);
+		}
+
 		dsl_dir_stats(ds->ds_dir, nv);
 	}
 
@@ -1486,17 +1919,17 @@
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
-	    ds->ds_phys->ds_creation_time);
+	    dsl_dataset_phys(ds)->ds_creation_time);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
-	    ds->ds_phys->ds_creation_txg);
+	    dsl_dataset_phys(ds)->ds_creation_txg);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
 	    ds->ds_quota);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
 	    ds->ds_reserved);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
-	    ds->ds_phys->ds_guid);
+	    dsl_dataset_phys(ds)->ds_guid);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
-	    ds->ds_phys->ds_unique_bytes);
+	    dsl_dataset_phys(ds)->ds_unique_bytes);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
 	    ds->ds_object);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
@@ -1504,13 +1937,13 @@
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
 	    DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
 
-	if (ds->ds_phys->ds_prev_snap_obj != 0) {
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		uint64_t written, comp, uncomp;
 		dsl_pool_t *dp = ds->ds_dir->dd_pool;
 		dsl_dataset_t *prev;
 
 		int err = dsl_dataset_hold_obj(dp,
-		    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 		if (err == 0) {
 			err = dsl_dataset_space_written(prev, ds, &written,
 			    &comp, &uncomp);
@@ -1521,6 +1954,32 @@
 			}
 		}
 	}
+
+	if (!dsl_dataset_is_snapshot(ds)) {
+		/*
+		 * A failed "newfs" (e.g. full) resumable receive leaves
+		 * the stats set on this dataset.  Check here for the prop.
+		 */
+		get_receive_resume_stats(ds, nv);
+
+		/*
+		 * A failed incremental resumable receive leaves the
+		 * stats set on our child named "%recv".  Check the child
+		 * for the prop.
+		 */
+		/* 6 extra bytes for /%recv */
+		char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+		dsl_dataset_t *recv_ds;
+		dsl_dataset_name(ds, recvname);
+		if (strlcat(recvname, "/", sizeof (recvname)) <
+		    sizeof (recvname) &&
+		    strlcat(recvname, recv_clone_name, sizeof (recvname)) <
+		    sizeof (recvname) &&
+		    dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
+			get_receive_resume_stats(recv_ds, nv);
+			dsl_dataset_rele(recv_ds, FTAG);
+		}
+	}
 }
 
 void
@@ -1529,13 +1988,15 @@
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	ASSERT(dsl_pool_config_held(dp));
 
-	stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
-	stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
-	stat->dds_guid = ds->ds_phys->ds_guid;
+	stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg;
+	stat->dds_inconsistent =
+	    dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT;
+	stat->dds_guid = dsl_dataset_phys(ds)->ds_guid;
 	stat->dds_origin[0] = '\0';
-	if (dsl_dataset_is_snapshot(ds)) {
+	if (ds->ds_is_snapshot) {
 		stat->dds_is_snapshot = B_TRUE;
-		stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
+		stat->dds_num_clones =
+		    dsl_dataset_phys(ds)->ds_num_children - 1;
 	} else {
 		stat->dds_is_snapshot = B_FALSE;
 		stat->dds_num_clones = 0;
@@ -1544,7 +2005,8 @@
 			dsl_dataset_t *ods;
 
 			VERIFY0(dsl_dataset_hold_obj(dp,
-			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
+			    dsl_dir_phys(ds->ds_dir)->dd_origin_obj,
+			    FTAG, &ods));
 			dsl_dataset_name(ods, stat->dds_origin);
 			dsl_dataset_rele(ods, FTAG);
 		}
@@ -1562,10 +2024,11 @@
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
-	*refdbytesp = ds->ds_phys->ds_referenced_bytes;
+	*refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
 	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
-	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
-		*availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
+	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
+		*availbytesp +=
+		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
 	if (ds->ds_quota != 0) {
 		/*
 		 * Adjust available bytes according to refquota
@@ -1576,21 +2039,26 @@
 		else
 			*availbytesp = 0;
 	}
-	*usedobjsp = ds->ds_phys->ds_bp.blk_fill;
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	*usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
 }
 
 boolean_t
-dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
+dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	uint64_t birth;
 
 	ASSERT(dsl_pool_config_held(dp));
-	if (ds->ds_prev == NULL)
+	if (snap == NULL)
 		return (B_FALSE);
-	if (ds->ds_phys->ds_bp.blk_birth >
-	    ds->ds_prev->ds_phys->ds_creation_txg) {
-		objset_t *os, *os_prev;
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	birth = dsl_dataset_get_blkptr(ds)->blk_birth;
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
+	if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
+		objset_t *os, *os_snap;
 		/*
 		 * It may be that only the ZIL differs, because it was
 		 * reset in the head.  Don't count that as being
 		 * modified.
@@ -1598,10 +2066,10 @@
 		 */
 		if (dmu_objset_from_ds(ds, &os) != 0)
 			return (B_TRUE);
-		if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0)
+		if (dmu_objset_from_ds(snap, &os_snap) != 0)
 			return (B_TRUE);
 		return (bcmp(&os->os_phys->os_meta_dnode,
-		    &os_prev->os_phys->os_meta_dnode,
+		    &os_snap->os_phys->os_meta_dnode,
 		    sizeof (os->os_phys->os_meta_dnode)) != 0);
 	}
 	return (B_FALSE);
@@ -1639,7 +2107,7 @@
 
 	/* dataset name + 1 for the "@" + the new snapshot name must fit */
 	if (dsl_dir_namelen(hds->ds_dir) + 1 +
-	    strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN)
+	    strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN)
 		error = SET_ERROR(ENAMETOOLONG);
 
 	return (error);
@@ -1696,11 +2164,13 @@
 	spa_history_log_internal_ds(ds, "rename", tx,
 	    "-> @%s", ddrsa->ddrsa_newsnapname);
 
-	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx));
+	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
+	    B_FALSE));
 	mutex_enter(&ds->ds_lock);
 	(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
 	mutex_exit(&ds->ds_lock);
-	VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj,
+	VERIFY0(zap_add(dp->dp_meta_objset,
+	    dsl_dataset_phys(hds)->ds_snapnames_zapobj,
 	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));
 
 #ifdef __FreeBSD__
@@ -1753,45 +2223,118 @@
 	ddrsa.ddrsa_recursive = recursive;
 
 	return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
-	    dsl_dataset_rename_snapshot_sync, &ddrsa, 1));
+	    dsl_dataset_rename_snapshot_sync, &ddrsa,
+	    1, ZFS_SPACE_CHECK_RESERVED));
 }
 
+/*
+ * If we're doing an ownership handoff, we need to make sure that there is
+ * only one long hold on the dataset.  We're not allowed to change anything here
+ * so we don't permanently release the long hold or regular hold here.  We want
+ * to do this only when syncing to avoid the dataset unexpectedly going away
+ * when we release the long hold.
+ */
 static int
+dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
+{
+	boolean_t held;
+
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
+	if (owner != NULL) {
+		VERIFY3P(ds->ds_owner, ==, owner);
+		dsl_dataset_long_rele(ds, owner);
+	}
+
+	held = dsl_dataset_long_held(ds);
+
+	if (owner != NULL)
+		dsl_dataset_long_hold(ds, owner);
+
+	if (held)
+		return (SET_ERROR(EBUSY));
+
+	return (0);
+}
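
dsl_dataset_handoff_check() counts long holds with the caller's own hold temporarily set aside: if anything remains, another consumer still has the dataset busy and the handoff fails with EBUSY; the hold is restored either way, and the check runs only in syncing context so the dataset cannot vanish in the window. The counting logic in miniature (plain globals as stand-ins, not the real refcount API):

    #include <errno.h>
    #include <stdio.h>

    static unsigned long longholds; /* stand-in for ds_longholds */

    /*
     * With the owner's own long hold set aside, any remaining long hold
     * means someone else still has the dataset busy and the handoff
     * must fail with EBUSY.
     */
    static int
    handoff_check_sketch(int caller_is_owner)
    {
            int busy;

            if (caller_is_owner)
                    longholds--;            /* set aside our own hold */
            busy = (longholds != 0);
            if (caller_is_owner)
                    longholds++;            /* restore it either way */
            return (busy ? EBUSY : 0);
    }

    int
    main(void)
    {
            longholds = 1;                  /* only the owner holds it */
            printf("%d\n", handoff_check_sketch(1));        /* 0: ok */
            longholds = 2;                  /* someone else also holds it */
            printf("%d\n", handoff_check_sketch(1));        /* EBUSY */
            return (0);
    }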
+
+typedef struct dsl_dataset_rollback_arg {
+	const char *ddra_fsname;
+	void *ddra_owner;
+	nvlist_t *ddra_result;
+} dsl_dataset_rollback_arg_t;
+
+static int
 dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
 {
-	const char *fsname = arg;
+	dsl_dataset_rollback_arg_t *ddra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int64_t unused_refres_delta;
 	int error;
 
-	error = dsl_dataset_hold(dp, fsname, FTAG, &ds);
+	error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	/* must not be a snapshot */
-	if (dsl_dataset_is_snapshot(ds)) {
+	if (ds->ds_is_snapshot) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* must have a most recent snapshot */
-	if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) {
+	if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
-	if (dsl_dataset_long_held(ds)) {
+	/*
+	 * No rollback to a snapshot created in the current txg, because
+	 * the rollback may dirty the dataset and create blocks that are
+	 * not reachable from the rootbp while having a birth txg that
+	 * falls into the snapshot's range.
+	 */
+	if (dmu_tx_is_syncing(tx) &&
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) {
 		dsl_dataset_rele(ds, FTAG);
-		return (SET_ERROR(EBUSY));
+		return (SET_ERROR(EAGAIN));
 	}
 
+	/* must not have any bookmarks after the most recent snapshot */
+	nvlist_t *proprequest = fnvlist_alloc();
+	fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
+	nvlist_t *bookmarks = fnvlist_alloc();
+	error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
+	fnvlist_free(proprequest);
+	if (error != 0)
+		return (error);
+	for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
+		nvlist_t *valuenv =
+		    fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
+		    zfs_prop_to_name(ZFS_PROP_CREATETXG));
+		uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
+		if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+			fnvlist_free(bookmarks);
+			dsl_dataset_rele(ds, FTAG);
+			return (SET_ERROR(EEXIST));
+		}
+	}
+	fnvlist_free(bookmarks);
+
+	error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
+	}
+
 	/*
 	 * Check if the snap we are rolling back to uses more than
 	 * the refquota.
 	 */
 	if (ds->ds_quota != 0 &&
-	    ds->ds_prev->ds_phys->ds_referenced_bytes > ds->ds_quota) {
+	    dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EDQUOT));
 	}
@@ -1804,7 +2347,7 @@
 	 * this space, but the freeing happens over many txg's.
 	 */
 	unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
-	    ds->ds_phys->ds_unique_bytes);
+	    dsl_dataset_phys(ds)->ds_unique_bytes);
 
 	if (unused_refres_delta > 0 &&
 	    unused_refres_delta >
@@ -1820,13 +2363,17 @@
 static void
 dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
 {
-	const char *fsname = arg;
+	dsl_dataset_rollback_arg_t *ddra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds, *clone;
 	uint64_t cloneobj;
+	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
 
-	VERIFY0(dsl_dataset_hold(dp, fsname, FTAG, &ds));
+	VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
 
+	dsl_dataset_name(ds->ds_prev, namebuf);
+	fnvlist_add_string(ddra->ddra_result, "target", namebuf);
+
 	cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
 	    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
 
@@ -1841,11 +2388,31 @@
 	dsl_dataset_rele(ds, FTAG);
 }
 
+/*
+ * Rolls back the given filesystem or volume to the most recent snapshot.
+ * The name of the most recent snapshot will be returned under key "target"
+ * in the result nvlist.
+ *
+ * If owner != NULL:
+ * - The existing dataset MUST be owned by the specified owner at entry
+ * - Upon return, dataset will still be held by the same owner, whether we
+ *   succeed or not.
+ *
+ * This mode is required any time the existing filesystem is mounted.  See
+ * notes above zfs_suspend_fs() for further details.
+ */
 int
-dsl_dataset_rollback(const char *fsname)
+dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result)
 {
+	dsl_dataset_rollback_arg_t ddra;
+
+	ddra.ddra_fsname = fsname;
+	ddra.ddra_owner = owner;
+	ddra.ddra_result = result;
+
 	return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
-	    dsl_dataset_rollback_sync, (void *)fsname, 1));
+	    dsl_dataset_rollback_sync, &ddra,
+	    1, ZFS_SPACE_CHECK_RESERVED));
 }
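
The rollback entry point now follows the standard DSL sync-task shape: inputs and outputs ride in one arg struct handed to both callbacks, the check callback validates without side effects, and the sync callback mutates state and fills in the result. A minimal sketch of that check/sync pairing (all names here are hypothetical):

    #include <errno.h>
    #include <stdio.h>

    typedef struct rollback_arg {
            const char      *fsname;        /* input, like ddra_fsname */
            char            target[64];     /* output, like ddra_result */
    } rollback_arg_t;

    /* Check phase: validate preconditions, no side effects. */
    static int
    rollback_check(void *arg)
    {
            rollback_arg_t *ra = arg;

            return (ra->fsname[0] == '\0' ? EINVAL : 0);
    }

    /* Sync phase: perform the change and record the result. */
    static void
    rollback_sync(void *arg)
    {
            rollback_arg_t *ra = arg;

            snprintf(ra->target, sizeof (ra->target), "%s@latest", ra->fsname);
    }

    /* Shape of dsl_sync_task(): sync runs only if check succeeded. */
    static int
    sync_task_sketch(rollback_arg_t *ra)
    {
            int err = rollback_check(ra);

            if (err == 0)
                    rollback_sync(ra);
            return (err);
    }

    int
    main(void)
    {
            rollback_arg_t ra = { "tank/fs", "" };

            if (sync_task_sketch(&ra) == 0)
                    printf("rolled back to %s\n", ra.target);
            return (0);
    }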
 
 struct promotenode {
@@ -1860,6 +2427,7 @@
 	dsl_dataset_t *origin_origin; /* origin of the origin */
 	uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
 	char *err_ds;
+	cred_t *cr;
 } dsl_dataset_promote_arg_t;
 
 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
@@ -1877,6 +2445,8 @@
 	dsl_dataset_t *origin_ds;
 	int err;
 	uint64_t unused;
+	uint64_t ss_mv_cnt;
+	size_t max_snap_len;
 
 	err = promote_hold(ddpa, dp, FTAG);
 	if (err != 0)
@@ -1883,8 +2453,9 @@
 		return (err);
 
 	hds = ddpa->ddpa_clone;
+	max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
 
-	if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
+	if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
 		promote_rele(ddpa, FTAG);
 		return (SET_ERROR(EXDEV));
 	}
@@ -1903,9 +2474,10 @@
 
 	/* compute origin's new unique space */
 	snap = list_tail(&ddpa->clone_snaps);
-	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
+	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+	    origin_ds->ds_object);
 	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
-	    origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+	    dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
 	    &ddpa->unique, &unused, &unused);
 
 	/*
@@ -1923,14 +2495,17 @@
 	 * Note however, if we stop before we reach the ORIGIN we get:
 	 * uN + kN + kN-1 + ... + kM - uM-1
 	 */
-	ddpa->used = origin_ds->ds_phys->ds_referenced_bytes;
-	ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes;
-	ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
+	ss_mv_cnt = 0;
+	ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
+	ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
+	ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
 	for (snap = list_head(&ddpa->shared_snaps); snap;
 	    snap = list_next(&ddpa->shared_snaps, snap)) {
 		uint64_t val, dlused, dlcomp, dluncomp;
 		dsl_dataset_t *ds = snap->ds;
 
+		ss_mv_cnt++;
+
 		/*
 		 * If there are long holds, we won't be able to evict
 		 * the objset.
@@ -1942,6 +2517,10 @@
 
 		/* Check that the snapshot name does not conflict */
 		VERIFY0(dsl_dataset_get_snapname(ds));
+		if (strlen(ds->ds_snapname) >= max_snap_len) {
+			err = SET_ERROR(ENAMETOOLONG);
+			goto out;
+		}
 		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
 		if (err == 0) {
 			(void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
@@ -1952,7 +2531,7 @@
 			goto out;
 
 		/* The very first snapshot does not have a deadlist */
-		if (ds->ds_phys->ds_prev_snap_obj == 0)
+		if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
 			continue;
 
 		dsl_deadlist_space(&ds->ds_deadlist,
@@ -1967,15 +2546,18 @@
 	 * so we need to subtract out the clone origin's used space.
 	 */
 	if (ddpa->origin_origin) {
-		ddpa->used -= ddpa->origin_origin->ds_phys->ds_referenced_bytes;
-		ddpa->comp -= ddpa->origin_origin->ds_phys->ds_compressed_bytes;
+		ddpa->used -=
+		    dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
+		ddpa->comp -=
+		    dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
 		ddpa->uncomp -=
-		    ddpa->origin_origin->ds_phys->ds_uncompressed_bytes;
+		    dsl_dataset_phys(ddpa->origin_origin)->
+		    ds_uncompressed_bytes;
 	}
 
-	/* Check that there is enough space here */
+	/* Check that there is enough space and limit headroom here */
 	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
-	    ddpa->used);
+	    0, ss_mv_cnt, ddpa->used, ddpa->cr);
 	if (err != 0)
 		goto out;
 
@@ -1985,7 +2567,7 @@
 	 * it is the amount of space that will be on all of their
 	 * deadlists (that was not born before their new origin).
 	 */
-	if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+	if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		uint64_t space;
 
 		/*
@@ -2007,9 +2589,11 @@
 			goto out;
 		ddpa->cloneusedsnap += space;
 	}
-	if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+	if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
+	    DD_FLAG_USED_BREAKDOWN) {
 		err = snaplist_space(&ddpa->origin_snaps,
-		    origin_ds->ds_phys->ds_creation_txg, &ddpa->originusedsnap);
+		    dsl_dataset_phys(origin_ds)->ds_creation_txg,
+		    &ddpa->originusedsnap);
 		if (err != 0)
 			goto out;
 	}
@@ -2032,11 +2616,14 @@
 	dsl_dir_t *odd = NULL;
 	uint64_t oldnext_obj;
 	int64_t delta;
+#if defined(__FreeBSD__) && defined(_KERNEL)
+	char *oldname, *newname;
+#endif
 
 	VERIFY0(promote_hold(ddpa, dp, FTAG));
 	hds = ddpa->ddpa_clone;
 
-	ASSERT0(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE);
+	ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
 
 	snap = list_head(&ddpa->shared_snaps);
 	origin_ds = snap->ds;
@@ -2054,49 +2641,59 @@
 
 	/* change origin's next snap */
 	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
-	oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
+	oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
 	snap = list_tail(&ddpa->clone_snaps);
-	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
-	origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;
+	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+	    origin_ds->ds_object);
+	dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
 
 	/* change the origin's next clone */
-	if (origin_ds->ds_phys->ds_next_clones_obj) {
+	if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
 		dsl_dataset_remove_from_next_clones(origin_ds,
 		    snap->ds->ds_object, tx);
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
-		    origin_ds->ds_phys->ds_next_clones_obj,
+		    dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
 		    oldnext_obj, tx));
 	}
 
 	/* change origin */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
-	ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
-	dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
+	ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
+	dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
 	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
 	dmu_buf_will_dirty(odd->dd_dbuf, tx);
-	odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
+	dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
 	origin_head->ds_dir->dd_origin_txg =
-	    origin_ds->ds_phys->ds_creation_txg;
+	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
 
 	/* change dd_clone entries */
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 		VERIFY0(zap_remove_int(dp->dp_meta_objset,
-		    odd->dd_phys->dd_clones, hds->ds_object, tx));
+		    dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
-		    ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
+		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
 		    hds->ds_object, tx));
 
 		VERIFY0(zap_remove_int(dp->dp_meta_objset,
-		    ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
+		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
 		    origin_head->ds_object, tx));
-		if (dd->dd_phys->dd_clones == 0) {
-			dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
-			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+		if (dsl_dir_phys(dd)->dd_clones == 0) {
+			dsl_dir_phys(dd)->dd_clones =
+			    zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
+			    DMU_OT_NONE, 0, tx);
 		}
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
-		    dd->dd_phys->dd_clones, origin_head->ds_object, tx));
+		    dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
 	}
 
+#if defined(__FreeBSD__) && defined(_KERNEL)
+	/* Take the spa_namespace_lock early so zvol renames don't deadlock. */
+	mutex_enter(&spa_namespace_lock);
+
+	oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+	newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+#endif
+
 	/* move snapshots to this dir */
 	for (snap = list_head(&ddpa->shared_snaps); snap;
 	    snap = list_next(&ddpa->shared_snaps, snap)) {
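
The FreeBSD-only block above takes spa_namespace_lock once, before the
snapshot loop, and pre-allocates the two MAXPATHLEN name buffers, so the
per-snapshot zvol rename never has to acquire the lock while dataset
locks are held. A minimal sketch of that pattern for a single dataset
(the promote code does the equivalent inside the loop; the helper name
is made up):

	static void
	rename_minors_sketch(dsl_dataset_t *ds)
	{
		char *oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		char *newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

		/* Lock first; the zvol rename path must not take this
		 * lock underneath the dataset locks held by promote. */
		mutex_enter(&spa_namespace_lock);

		dsl_dataset_name(ds, oldname);	/* name before the move */
		/* ... rewire ds into its new dsl_dir ... */
		dsl_dataset_name(ds, newname);	/* name after the move */
		zfsvfs_update_fromname(oldname, newname);
		zvol_rename_minors(oldname, newname);

		mutex_exit(&spa_namespace_lock);
		kmem_free(newname, MAXPATHLEN);
		kmem_free(oldname, MAXPATHLEN);
	}
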
@@ -2115,28 +2712,36 @@
 		/* move snap name entry */
 		VERIFY0(dsl_dataset_get_snapname(ds));
 		VERIFY0(dsl_dataset_snap_remove(origin_head,
-		    ds->ds_snapname, tx));
+		    ds->ds_snapname, tx, B_TRUE));
 		VERIFY0(zap_add(dp->dp_meta_objset,
-		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
+		    dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
 		    8, 1, &ds->ds_object, tx));
+		dsl_fs_ss_count_adjust(hds->ds_dir, 1,
+		    DD_FIELD_SNAPSHOT_COUNT, tx);
 
 		/* change containing dsl_dir */
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
-		ds->ds_phys->ds_dir_obj = dd->dd_object;
+		ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
+		dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
 		ASSERT3P(ds->ds_dir, ==, odd);
 		dsl_dir_rele(ds->ds_dir, ds);
 		VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
 		    NULL, ds, &ds->ds_dir));
 
+#if defined(__FreeBSD__) && defined(_KERNEL)
+		dsl_dataset_name(ds, newname);
+		zfsvfs_update_fromname(oldname, newname);
+		zvol_rename_minors(oldname, newname);
+#endif
+
 		/* move any clone references */
-		if (ds->ds_phys->ds_next_clones_obj &&
+		if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
 		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 			zap_cursor_t zc;
 			zap_attribute_t za;
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset,
-			    ds->ds_phys->ds_next_clones_obj);
+			    dsl_dataset_phys(ds)->ds_next_clones_obj);
 			    zap_cursor_retrieve(&zc, &za) == 0;
 			    zap_cursor_advance(&zc)) {
 				dsl_dataset_t *cnds;
@@ -2152,12 +2757,13 @@
 
 				VERIFY0(dsl_dataset_hold_obj(dp,
 				    za.za_first_integer, FTAG, &cnds));
-				o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
+				o = dsl_dir_phys(cnds->ds_dir)->
+				    dd_head_dataset_obj;
 
 				VERIFY0(zap_remove_int(dp->dp_meta_objset,
-				    odd->dd_phys->dd_clones, o, tx));
+				    dsl_dir_phys(odd)->dd_clones, o, tx));
 				VERIFY0(zap_add_int(dp->dp_meta_objset,
-				    dd->dd_phys->dd_clones, o, tx));
+				    dsl_dir_phys(dd)->dd_clones, o, tx));
 				dsl_dataset_rele(cnds, FTAG);
 			}
 			zap_cursor_fini(&zc);
@@ -2166,6 +2772,12 @@
 		ASSERT(!dsl_prop_hascb(ds));
 	}
 
+#if defined(__FreeBSD__) && defined(_KERNEL)
+	mutex_exit(&spa_namespace_lock);
+
+	kmem_free(newname, MAXPATHLEN);
+	kmem_free(oldname, MAXPATHLEN);
+#endif
 	/*
 	 * Change space accounting.
 	 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
@@ -2174,7 +2786,7 @@
 	 */
 
 	delta = ddpa->cloneusedsnap -
-	    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
+	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
 	ASSERT3S(delta, >=, 0);
 	ASSERT3U(ddpa->used, >=, delta);
 	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
@@ -2182,7 +2794,7 @@
 	    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
 
 	delta = ddpa->originusedsnap -
-	    odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
+	    dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
 	ASSERT3S(delta, <=, 0);
 	ASSERT3U(ddpa->used, >=, -delta);
 	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
@@ -2189,7 +2801,7 @@
 	dsl_dir_diduse_space(odd, DD_USED_HEAD,
 	    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
 
-	origin_ds->ds_phys->ds_unique_bytes = ddpa->unique;
+	dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
 
 	/* log history record */
 	spa_history_log_internal_ds(hds, "promote", tx, "");
@@ -2224,12 +2836,12 @@
 			return (err);
 
 		if (first_obj == 0)
-			first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
+			first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
 
 		snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
 		snap->ds = ds;
 		list_insert_tail(l, snap);
-		obj = ds->ds_phys->ds_prev_snap_obj;
+		obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	}
 
 	return (0);
@@ -2279,13 +2891,13 @@
 		return (error);
 	dd = ddpa->ddpa_clone->ds_dir;
 
-	if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) ||
+	if (ddpa->ddpa_clone->ds_is_snapshot ||
 	    !dsl_dir_is_clone(dd)) {
 		dsl_dataset_rele(ddpa->ddpa_clone, tag);
 		return (SET_ERROR(EINVAL));
 	}
 
-	error = snaplist_make(dp, 0, dd->dd_phys->dd_origin_obj,
+	error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
 	    &ddpa->shared_snaps, tag);
 	if (error != 0)
 		goto out;
@@ -2296,16 +2908,16 @@
 		goto out;
 
 	snap = list_head(&ddpa->shared_snaps);
-	ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
-	error = snaplist_make(dp, dd->dd_phys->dd_origin_obj,
-	    snap->ds->ds_dir->dd_phys->dd_head_dataset_obj,
+	ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
+	error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
+	    dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
 	    &ddpa->origin_snaps, tag);
 	if (error != 0)
 		goto out;
 
-	if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+	if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
 		error = dsl_dataset_hold_obj(dp,
-		    snap->ds->ds_dir->dd_phys->dd_origin_obj,
+		    dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
 		    tag, &ddpa->origin_origin);
 		if (error != 0)
 			goto out;
@@ -2331,7 +2943,7 @@
  * Promote a clone.
  *
  * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
- * in with the name.  (It must be at least MAXNAMELEN bytes long.)
+ * in with the name.  (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.)
  */
 int
 dsl_dataset_promote(const char *name, char *conflsnap)
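
With MAXNAMELEN replaced by ZFS_MAX_DATASET_NAME_LEN, callers of
dsl_dataset_promote() must size the conflict buffer to the new constant.
A hedged usage sketch ("tank/fs/clone" is a made-up name; the EEXIST
convention follows the promote check logic earlier in this file):

	char conflsnap[ZFS_MAX_DATASET_NAME_LEN];
	int err;

	err = dsl_dataset_promote("tank/fs/clone", conflsnap);
	if (err == EEXIST) {
		/* The same snapshot name exists on both sides. */
		printf("conflicting snapshot: %s\n", conflsnap);
	}
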
@@ -2349,7 +2961,8 @@
 	if (error != 0)
 		return (error);
 	error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
-	    dmu_objset_ds(os)->ds_phys->ds_snapnames_zapobj, &numsnaps);
+	    dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
+	    &numsnaps);
 	dmu_objset_rele(os, FTAG);
 	if (error != 0)
 		return (error);
@@ -2356,31 +2969,37 @@
 
 	ddpa.ddpa_clonename = name;
 	ddpa.err_ds = conflsnap;
+	ddpa.cr = CRED();
 
 	return (dsl_sync_task(name, dsl_dataset_promote_check,
-	    dsl_dataset_promote_sync, &ddpa, 2 + numsnaps));
+	    dsl_dataset_promote_sync, &ddpa,
+	    2 + numsnaps, ZFS_SPACE_CHECK_RESERVED));
 }
 
 int
 dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
-    dsl_dataset_t *origin_head, boolean_t force)
+    dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
 {
+	/*
+	 * "slack" factor for received datasets with refquota set on them.
+	 * See the bottom of this function for details on its use.
+	 */
+	uint64_t refquota_slack = DMU_MAX_ACCESS * spa_asize_inflation;
 	int64_t unused_refres_delta;
 
 	/* they should both be heads */
-	if (dsl_dataset_is_snapshot(clone) ||
-	    dsl_dataset_is_snapshot(origin_head))
+	if (clone->ds_is_snapshot ||
+	    origin_head->ds_is_snapshot)
 		return (SET_ERROR(EINVAL));
 
-	/* the branch point should be just before them */
-	if (clone->ds_prev != origin_head->ds_prev)
+	/* if we are not forcing, the branch point should be just before them */
+	if (!force && clone->ds_prev != origin_head->ds_prev)
 		return (SET_ERROR(EINVAL));
 
 	/* clone should be the clone (unless they are unrelated) */
 	if (clone->ds_prev != NULL &&
 	    clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
-	    origin_head->ds_object !=
-	    clone->ds_prev->ds_phys->ds_next_snap_obj)
+	    origin_head->ds_dir != clone->ds_prev->ds_dir)
 		return (SET_ERROR(EINVAL));
 
 	/* the clone should be a child of the origin */
@@ -2388,19 +3007,20 @@
 		return (SET_ERROR(EINVAL));
 
 	/* origin_head shouldn't be modified unless 'force' */
-	if (!force && dsl_dataset_modified_since_lastsnap(origin_head))
+	if (!force &&
+	    dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
 		return (SET_ERROR(ETXTBSY));
 
 	/* origin_head should have no long holds (e.g. is not mounted) */
-	if (dsl_dataset_long_held(origin_head))
+	if (dsl_dataset_handoff_check(origin_head, owner, tx))
 		return (SET_ERROR(EBUSY));
 
 	/* check amount of any unconsumed refreservation */
 	unused_refres_delta =
 	    (int64_t)MIN(origin_head->ds_reserved,
-	    origin_head->ds_phys->ds_unique_bytes) -
+	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
 	    (int64_t)MIN(origin_head->ds_reserved,
-	    clone->ds_phys->ds_unique_bytes);
+	    dsl_dataset_phys(clone)->ds_unique_bytes);
 
 	if (unused_refres_delta > 0 &&
 	    unused_refres_delta >
@@ -2407,9 +3027,22 @@
 	    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
 		return (SET_ERROR(ENOSPC));
 
-	/* clone can't be over the head's refquota */
+	/*
+	 * The clone can't be too much over the head's refquota.
+	 *
+	 * To ensure that the entire refquota can be used, we allow one
+ * transaction to exceed the refquota.  Therefore, this check
+	 * needs to also allow for the space referenced to be more than the
+	 * refquota.  The maximum amount of space that one transaction can use
+	 * on disk is DMU_MAX_ACCESS * spa_asize_inflation.  Allowing this
+	 * overage ensures that we are able to receive a filesystem that
+	 * exceeds the refquota on the source system.
+	 *
+	 * So that overage is the refquota_slack we use below.
+	 */
 	if (origin_head->ds_quota != 0 &&
-	    clone->ds_phys->ds_referenced_bytes > origin_head->ds_quota)
+	    dsl_dataset_phys(clone)->ds_referenced_bytes >
+	    origin_head->ds_quota + refquota_slack)
 		return (SET_ERROR(EDQUOT));
 
 	return (0);
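
For a sense of scale, assuming DMU_MAX_ACCESS is 64 MB and
spa_asize_inflation is at its default of 24 (illustrative values; the
authoritative definitions are in dmu.h and spa_misc.c):

	/* Illustrative numbers only; both inputs are tunable. */
	uint64_t refquota_slack = (64ULL << 20) * 24;	/* 1536 MB */

so a received clone may reference roughly 1.5 GB beyond the refquota
before the check above rejects the swap with EDQUOT.
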
@@ -2423,9 +3056,49 @@
 	int64_t unused_refres_delta;
 
 	ASSERT(clone->ds_reserved == 0);
+	/*
+	 * NOTE: On DEBUG kernels there could be a race between this and
+	 * the check function if spa_asize_inflation is adjusted...
+	 */
 	ASSERT(origin_head->ds_quota == 0 ||
-	    clone->ds_phys->ds_unique_bytes <= origin_head->ds_quota);
+	    dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota +
+	    DMU_MAX_ACCESS * spa_asize_inflation);
+	ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
 
+	/*
+	 * Swap per-dataset feature flags.
+	 */
+	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+		if (!(spa_feature_table[f].fi_flags &
+		    ZFEATURE_FLAG_PER_DATASET)) {
+			ASSERT(!clone->ds_feature_inuse[f]);
+			ASSERT(!origin_head->ds_feature_inuse[f]);
+			continue;
+		}
+
+		boolean_t clone_inuse = clone->ds_feature_inuse[f];
+		boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f];
+
+		if (clone_inuse) {
+			dsl_dataset_deactivate_feature(clone->ds_object, f, tx);
+			clone->ds_feature_inuse[f] = B_FALSE;
+		}
+		if (origin_head_inuse) {
+			dsl_dataset_deactivate_feature(origin_head->ds_object,
+			    f, tx);
+			origin_head->ds_feature_inuse[f] = B_FALSE;
+		}
+		if (clone_inuse) {
+			dsl_dataset_activate_feature(origin_head->ds_object,
+			    f, tx);
+			origin_head->ds_feature_inuse[f] = B_TRUE;
+		}
+		if (origin_head_inuse) {
+			dsl_dataset_activate_feature(clone->ds_object, f, tx);
+			clone->ds_feature_inuse[f] = B_TRUE;
+		}
+	}
+
 	dmu_buf_will_dirty(clone->ds_dbuf, tx);
 	dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
 
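
In the feature-flag swap above, both deactivations happen before either
activation: if the clone and origin_head both have a per-dataset feature
active, activating first would try to add an entry that is still
recorded for the other dataset. A stripped-down sketch of the ordering,
with hypothetical mark_clear()/mark_set() standing in for
dsl_dataset_deactivate_feature()/dsl_dataset_activate_feature():

	static void mark_clear(uint64_t obj);	/* removes obj's entry */
	static void mark_set(uint64_t obj);	/* adds it; asserts absent */

	static void
	swap_marks(uint64_t a_obj, uint64_t b_obj,
	    boolean_t a_on, boolean_t b_on)
	{
		/* Clear both entries first so the set step below never
		 * collides with an entry not yet removed. */
		if (a_on)
			mark_clear(a_obj);
		if (b_on)
			mark_clear(b_obj);
		if (a_on)
			mark_set(b_obj);	/* a's state moves to b */
		if (b_on)
			mark_set(a_obj);	/* and b's moves to a */
	}
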
@@ -2441,9 +3114,9 @@
 
 	unused_refres_delta =
 	    (int64_t)MIN(origin_head->ds_reserved,
-	    origin_head->ds_phys->ds_unique_bytes) -
+	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
 	    (int64_t)MIN(origin_head->ds_reserved,
-	    clone->ds_phys->ds_unique_bytes);
+	    dsl_dataset_phys(clone)->ds_unique_bytes);
 
 	/*
 	 * Reset origin's unique bytes, if it exists.
@@ -2454,16 +3127,21 @@
 
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
 		dsl_deadlist_space_range(&clone->ds_deadlist,
-		    origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
-		    &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
+		    dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
+		    &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
 	}
 
 	/* swap blkptrs */
 	{
+		rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG);
+		rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG);
 		blkptr_t tmp;
-		tmp = origin_head->ds_phys->ds_bp;
-		origin_head->ds_phys->ds_bp = clone->ds_phys->ds_bp;
-		clone->ds_phys->ds_bp = tmp;
+		tmp = dsl_dataset_phys(origin_head)->ds_bp;
+		dsl_dataset_phys(origin_head)->ds_bp =
+		    dsl_dataset_phys(clone)->ds_bp;
+		dsl_dataset_phys(clone)->ds_bp = tmp;
+		rrw_exit(&origin_head->ds_bp_rwlock, FTAG);
+		rrw_exit(&clone->ds_bp_rwlock, FTAG);
 	}
 
 	/* set dd_*_bytes */
@@ -2472,7 +3150,7 @@
 		uint64_t cdl_used, cdl_comp, cdl_uncomp;
 		uint64_t odl_used, odl_comp, odl_uncomp;
 
-		ASSERT3U(clone->ds_dir->dd_phys->
+		ASSERT3U(dsl_dir_phys(clone->ds_dir)->
 		    dd_used_breakdown[DD_USED_SNAP], ==, 0);
 
 		dsl_deadlist_space(&clone->ds_deadlist,
@@ -2480,13 +3158,18 @@
 		dsl_deadlist_space(&origin_head->ds_deadlist,
 		    &odl_used, &odl_comp, &odl_uncomp);
 
-		dused = clone->ds_phys->ds_referenced_bytes + cdl_used -
-		    (origin_head->ds_phys->ds_referenced_bytes + odl_used);
-		dcomp = clone->ds_phys->ds_compressed_bytes + cdl_comp -
-		    (origin_head->ds_phys->ds_compressed_bytes + odl_comp);
-		duncomp = clone->ds_phys->ds_uncompressed_bytes +
+		dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
+		    cdl_used -
+		    (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
+		    odl_used);
+		dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
+		    cdl_comp -
+		    (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
+		    odl_comp);
+		duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
 		    cdl_uncomp -
-		    (origin_head->ds_phys->ds_uncompressed_bytes + odl_uncomp);
+		    (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
+		    odl_uncomp);
 
 		dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
 		    dused, dcomp, duncomp, tx);
@@ -2506,18 +3189,18 @@
 		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
 		    &odl_used, &odl_comp, &odl_uncomp);
 		dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
-		    DD_USED_HEAD, DD_USED_SNAP, tx);
+		    DD_USED_HEAD, DD_USED_SNAP, NULL);
 	}
 
 	/* swap ds_*_bytes */
-	SWITCH64(origin_head->ds_phys->ds_referenced_bytes,
-	    clone->ds_phys->ds_referenced_bytes);
-	SWITCH64(origin_head->ds_phys->ds_compressed_bytes,
-	    clone->ds_phys->ds_compressed_bytes);
-	SWITCH64(origin_head->ds_phys->ds_uncompressed_bytes,
-	    clone->ds_phys->ds_uncompressed_bytes);
-	SWITCH64(origin_head->ds_phys->ds_unique_bytes,
-	    clone->ds_phys->ds_unique_bytes);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
+	    dsl_dataset_phys(clone)->ds_referenced_bytes);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
+	    dsl_dataset_phys(clone)->ds_compressed_bytes);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
+	    dsl_dataset_phys(clone)->ds_uncompressed_bytes);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
+	    dsl_dataset_phys(clone)->ds_unique_bytes);
 
 	/* apply any parent delta for change in unconsumed refreservation */
 	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
@@ -2528,12 +3211,12 @@
 	 */
 	dsl_deadlist_close(&clone->ds_deadlist);
 	dsl_deadlist_close(&origin_head->ds_deadlist);
-	SWITCH64(origin_head->ds_phys->ds_deadlist_obj,
-	    clone->ds_phys->ds_deadlist_obj);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
+	    dsl_dataset_phys(clone)->ds_deadlist_obj);
 	dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
-	    clone->ds_phys->ds_deadlist_obj);
+	    dsl_dataset_phys(clone)->ds_deadlist_obj);
 	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
-	    origin_head->ds_phys->ds_deadlist_obj);
+	    dsl_dataset_phys(origin_head)->ds_deadlist_obj);
 
 	dsl_scan_ds_clone_swapped(origin_head, clone, tx);
 
@@ -2584,10 +3267,11 @@
 	/*
 	 * Make a space adjustment for reserved bytes.
 	 */
-	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
+	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
 		ASSERT3U(*used, >=,
-		    ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
-		*used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
+		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
+		*used -=
+		    (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
 		*ref_rsrv =
 		    asize - MIN(asize, parent_delta(ds, asize + inflight));
 	}
@@ -2602,9 +3286,10 @@
 	 * on-disk is over quota and there are no pending changes (which
 	 * may free up space for us).
 	 */
-	if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
+	if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
+	    ds->ds_quota) {
 		if (inflight > 0 ||
-		    ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
+		    dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
 			error = SET_ERROR(ERESTART);
 		else
 			error = SET_ERROR(EDQUOT);
@@ -2638,7 +3323,7 @@
 	if (error != 0)
 		return (error);
 
-	if (dsl_dataset_is_snapshot(ds)) {
+	if (ds->ds_is_snapshot) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
@@ -2656,7 +3341,7 @@
 		return (0);
 	}
 
-	if (newval < ds->ds_phys->ds_referenced_bytes ||
+	if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
 	    newval < ds->ds_reserved) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(ENOSPC));
@@ -2702,7 +3387,7 @@
 	ddsqra.ddsqra_value = refquota;
 
 	return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
-	    dsl_dataset_set_refquota_sync, &ddsqra, 0));
+	    dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 static int
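
A recurring change in this sync: dsl_sync_task() grew a
zfs_space_check_t argument. Tasks that only free or reshuffle space
(destroys, the refquota and refreservation setters in this file) pass
ZFS_SPACE_CHECK_NONE, while tasks that allocate metadata (promote,
delegation updates) pass ZFS_SPACE_CHECK_RESERVED and may dip into the
pool's slop space. The new call shape, with placeholder callbacks and
argument struct:

	error = dsl_sync_task(dsname, my_check_func, my_sync_func,
	    &my_arg, 0 /* blocks modified */, ZFS_SPACE_CHECK_NONE);
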
@@ -2721,7 +3406,7 @@
 	if (error != 0)
 		return (error);
 
-	if (dsl_dataset_is_snapshot(ds)) {
+	if (ds->ds_is_snapshot) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
@@ -2746,7 +3431,7 @@
 	mutex_enter(&ds->ds_lock);
 	if (!DS_UNIQUE_IS_ACCURATE(ds))
 		dsl_dataset_recalc_head_uniq(ds);
-	unique = ds->ds_phys->ds_unique_bytes;
+	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
 	mutex_exit(&ds->ds_lock);
 
 	if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
@@ -2783,7 +3468,7 @@
 	mutex_enter(&ds->ds_dir->dd_lock);
 	mutex_enter(&ds->ds_lock);
 	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
-	unique = ds->ds_phys->ds_unique_bytes;
+	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
 	delta = MAX(0, (int64_t)(newval - unique)) -
 	    MAX(0, (int64_t)(ds->ds_reserved - unique));
 	ds->ds_reserved = newval;
@@ -2817,7 +3502,8 @@
 	ddsqra.ddsqra_value = refreservation;
 
 	return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
-	    dsl_dataset_set_refreservation_sync, &ddsqra, 0));
+	    dsl_dataset_set_refreservation_sync, &ddsqra,
+	    0, ZFS_SPACE_CHECK_NONE));
 }
 
 /*
@@ -2848,16 +3534,16 @@
 	ASSERT(dsl_pool_config_held(dp));
 
 	*usedp = 0;
-	*usedp += new->ds_phys->ds_referenced_bytes;
-	*usedp -= oldsnap->ds_phys->ds_referenced_bytes;
+	*usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
+	*usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
 
 	*compp = 0;
-	*compp += new->ds_phys->ds_compressed_bytes;
-	*compp -= oldsnap->ds_phys->ds_compressed_bytes;
+	*compp += dsl_dataset_phys(new)->ds_compressed_bytes;
+	*compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
 
 	*uncompp = 0;
-	*uncompp += new->ds_phys->ds_uncompressed_bytes;
-	*uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
+	*uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
+	*uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
 
 	snapobj = new->ds_object;
 	while (snapobj != oldsnap->ds_object) {
@@ -2872,8 +3558,8 @@
 				break;
 		}
 
-		if (snap->ds_phys->ds_prev_snap_txg ==
-		    oldsnap->ds_phys->ds_creation_txg) {
+		if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
+		    dsl_dataset_phys(oldsnap)->ds_creation_txg) {
 			/*
 			 * The blocks in the deadlist can not be born after
 			 * ds_prev_snap_txg, so get the whole deadlist space,
@@ -2886,7 +3572,7 @@
 			    &used, &comp, &uncomp);
 		} else {
 			dsl_deadlist_space_range(&snap->ds_deadlist,
-			    0, oldsnap->ds_phys->ds_creation_txg,
+			    0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
 			    &used, &comp, &uncomp);
 		}
 		*usedp += used;
@@ -2898,7 +3584,7 @@
 		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
 		 * was not a snapshot of/before new.
 		 */
-		snapobj = snap->ds_phys->ds_prev_snap_obj;
+		snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 		if (snap != new)
 			dsl_dataset_rele(snap, FTAG);
 		if (snapobj == 0) {
@@ -2934,8 +3620,8 @@
 	uint64_t snapobj;
 	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
 
-	ASSERT(dsl_dataset_is_snapshot(firstsnap));
-	ASSERT(dsl_dataset_is_snapshot(lastsnap));
+	ASSERT(firstsnap->ds_is_snapshot);
+	ASSERT(lastsnap->ds_is_snapshot);
 
 	/*
 	 * Check that the snapshots are in the same dsl_dir, and firstsnap
@@ -2942,13 +3628,13 @@
 	 * is before lastsnap.
 	 */
 	if (firstsnap->ds_dir != lastsnap->ds_dir ||
-	    firstsnap->ds_phys->ds_creation_txg >
-	    lastsnap->ds_phys->ds_creation_txg)
+	    dsl_dataset_phys(firstsnap)->ds_creation_txg >
+	    dsl_dataset_phys(lastsnap)->ds_creation_txg)
 		return (SET_ERROR(EINVAL));
 
 	*usedp = *compp = *uncompp = 0;
 
-	snapobj = lastsnap->ds_phys->ds_next_snap_obj;
+	snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
 	while (snapobj != firstsnap->ds_object) {
 		dsl_dataset_t *ds;
 		uint64_t used, comp, uncomp;
@@ -2958,13 +3644,13 @@
 			break;
 
 		dsl_deadlist_space_range(&ds->ds_deadlist,
-		    firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+		    dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
 		    &used, &comp, &uncomp);
 		*usedp += used;
 		*compp += comp;
 		*uncompp += uncomp;
 
-		snapobj = ds->ds_phys->ds_prev_snap_obj;
+		snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		ASSERT3U(snapobj, !=, 0);
 		dsl_dataset_rele(ds, FTAG);
 	}
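
Both walks above reduce to deadlist sums. For
dsl_dataset_space_written() the loop computes, in effect:

	/*
	 * written(old, new) = referenced(new) - referenced(old)
	 *     + SUM over snapshots S, old < S <= new, of
	 *       deadlist_space_range(S, 0, creation_txg(old))
	 *
	 * Blocks that 'old' referenced but that were later freed sit in
	 * the intermediate deadlists; adding them back leaves exactly
	 * the space written after 'old' was taken.
	 */
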
@@ -2977,9 +3663,12 @@
  * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
  * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
  * filesystem.  Or 'earlier' could be the origin's origin.
+ *
+ * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
  */
 boolean_t
-dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier)
+dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
+    uint64_t earlier_txg)
 {
 	dsl_pool_t *dp = later->ds_dir->dd_pool;
 	int error;
@@ -2986,9 +3675,13 @@
 	boolean_t ret;
 
 	ASSERT(dsl_pool_config_held(dp));
+	ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
 
-	if (earlier->ds_phys->ds_creation_txg >=
-	    later->ds_phys->ds_creation_txg)
+	if (earlier_txg == 0)
+		earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
+
+	if (later->ds_is_snapshot &&
+	    earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
 		return (B_FALSE);
 
 	if (later->ds_dir == earlier->ds_dir)
@@ -2996,14 +3689,38 @@
 	if (!dsl_dir_is_clone(later->ds_dir))
 		return (B_FALSE);
 
-	if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object)
+	if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
 		return (B_TRUE);
 	dsl_dataset_t *origin;
 	error = dsl_dataset_hold_obj(dp,
-	    later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin);
+	    dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
 	if (error != 0)
 		return (B_FALSE);
-	ret = dsl_dataset_is_before(origin, earlier);
+	ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
 	dsl_dataset_rele(origin, FTAG);
 	return (ret);
 }
+
+void
+dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
+}
+
+boolean_t
+dsl_dataset_is_zapified(dsl_dataset_t *ds)
+{
+	dmu_object_info_t doi;
+
+	dmu_object_info_from_db(ds->ds_dbuf, &doi);
+	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
+}
+
+boolean_t
+dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
+{
+	return (dsl_dataset_is_zapified(ds) &&
+	    zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
+	    ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
+}
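
The last helper above is what the dsl_destroy_inconsistent() change
later in this commit consults, so a dataset left inconsistent by a
resumable receive is not garbage-collected. A sketch of the caller
pattern (mirroring that change, not a new API):

	/* Only auto-destroy an inconsistent dataset when no resumable
	 * receive state is recorded on it. */
	if (DS_IS_INCONSISTENT(ds) &&
	    !dsl_dataset_has_resume_receive_state(ds))
		(void) dsl_destroy_head(dsname);
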

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,6 +22,8 @@
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/dsl_dataset.h>
@@ -120,6 +122,8 @@
 	void *cookie = NULL;
 	dsl_deadlist_entry_t *dle;
 
+	dl->dl_os = NULL;
+
 	if (dl->dl_oldfmt) {
 		dl->dl_oldfmt = B_FALSE;
 		bpobj_close(&dl->dl_bpobj);
@@ -144,7 +148,7 @@
 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
 {
 	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
-		return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));
+		return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
 	return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
 	    sizeof (dsl_deadlist_phys_t), tx));
 }
@@ -181,7 +185,7 @@
 {
 	if (dle->dle_bpobj.bpo_object ==
 	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
-		uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+		uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
 		bpobj_close(&dle->dle_bpobj);
 		bpobj_decr_empty(dl->dl_os, tx);
 		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
@@ -255,7 +259,7 @@
 
 	dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
 	dle->dle_mintxg = mintxg;
-	obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+	obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
 	VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
 	avl_add(&dl->dl_tree, dle);
 
@@ -309,8 +313,9 @@
 	while (mrs_obj != 0) {
 		dsl_dataset_t *ds;
 		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
-		dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx);
-		mrs_obj = ds->ds_phys->ds_prev_snap_obj;
+		dsl_deadlist_add_key(&dl,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+		mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		dsl_dataset_rele(ds, FTAG);
 	}
 	dsl_deadlist_close(&dl);
@@ -339,7 +344,7 @@
 		if (dle->dle_mintxg >= maxtxg)
 			break;
 
-		obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+		obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
 		VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
 		    dle->dle_mintxg, obj, tx));
 	}
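
The SPA_MAXBLOCKSIZE to SPA_OLD_MAXBLOCKSIZE substitutions in this file
track large-block support: SPA_MAXBLOCKSIZE may now be as large as
16 MB, and these bpobjs deliberately keep the historic 128 KB cap. For
reference, the constants as defined around this sync (the authoritative
definitions live in sys/spa.h):

	#define	SPA_OLD_MAXBLOCKSHIFT	17	/* 128 KB */
	#define	SPA_MAXBLOCKSHIFT	24	/* 16 MB */
	#define	SPA_OLD_MAXBLOCKSIZE	(1ULL << SPA_OLD_MAXBLOCKSHIFT)
	#define	SPA_MAXBLOCKSIZE	(1ULL << SPA_MAXBLOCKSHIFT)
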

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
  */
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
 /*
@@ -165,10 +165,10 @@
 
 	VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
 
-	zapobj = dd->dd_phys->dd_deleg_zapobj;
+	zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 	if (zapobj == 0) {
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
-		zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
+		zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
 		    DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
 	}
 
@@ -209,7 +209,7 @@
 	uint64_t zapobj;
 
 	VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
-	zapobj = dd->dd_phys->dd_deleg_zapobj;
+	zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 	if (zapobj == 0) {
 		dsl_dir_rele(dd, FTAG);
 		return;
@@ -283,7 +283,7 @@
 
 	return (dsl_sync_task(ddname, dsl_deleg_check,
 	    unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
-	    &dda, fnvlist_num_pairs(nvp)));
+	    &dda, fnvlist_num_pairs(nvp), ZFS_SPACE_CHECK_RESERVED));
 }
 
 /*
@@ -331,16 +331,16 @@
 		zap_attribute_t baseza;
 		nvlist_t *sp_nvp;
 		uint64_t n;
-		char source[MAXNAMELEN];
+		char source[ZFS_MAX_DATASET_NAME_LEN];
 
-		if (dd->dd_phys->dd_deleg_zapobj == 0 ||
-		    zap_count(mos, dd->dd_phys->dd_deleg_zapobj, &n) != 0 ||
-		    n == 0)
+		if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 ||
+		    zap_count(mos,
+		    dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0)
 			continue;
 
 		sp_nvp = fnvlist_alloc();
 		for (zap_cursor_init(&basezc, mos,
-		    dd->dd_phys->dd_deleg_zapobj);
+		    dsl_dir_phys(dd)->dd_deleg_zapobj);
 		    zap_cursor_retrieve(&basezc, &baseza) == 0;
 		    zap_cursor_advance(&basezc)) {
 			zap_cursor_t zc;
@@ -562,7 +562,7 @@
 	    SPA_VERSION_DELEGATED_PERMS)
 		return (SET_ERROR(EPERM));
 
-	if (dsl_dataset_is_snapshot(ds)) {
+	if (ds->ds_is_snapshot) {
 		/*
 		 * Snapshots are treated as descendents only,
 		 * local permissions do not apply.
@@ -595,7 +595,7 @@
 			if (!zoned)
 				break;
 		}
-		zapobj = dd->dd_phys->dd_deleg_zapobj;
+		zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 
 		if (zapobj == 0)
 			continue;
@@ -674,7 +674,7 @@
 {
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
 	uint64_t jumpobj, pjumpobj;
-	uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+	uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	char whokey[ZFS_MAX_DELEG_NAME];
@@ -687,7 +687,7 @@
 
 	if (zapobj == 0) {
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
-		zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
+		zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
 		    DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
 	}
 
@@ -725,7 +725,7 @@
 		return;
 
 	for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) {
-		uint64_t pzapobj = dd->dd_phys->dd_deleg_zapobj;
+		uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 
 		if (pzapobj == 0)
 			continue;

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,8 +21,10 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
@@ -39,6 +41,7 @@
 #include <sys/zfeature.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/dsl_deleg.h>
+#include <sys/dmu_impl.h>
 
 typedef struct dmu_snapshots_destroy_arg {
 	nvlist_t *dsda_snaps;
@@ -47,13 +50,10 @@
 	nvlist_t *dsda_errlist;
 } dmu_snapshots_destroy_arg_t;
 
-/*
- * ds must be owned.
- */
-static int
+int
 dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
 {
-	if (!dsl_dataset_is_snapshot(ds))
+	if (!ds->ds_is_snapshot)
 		return (SET_ERROR(EINVAL));
 
 	if (dsl_dataset_long_held(ds))
@@ -80,7 +80,7 @@
 	/*
 	 * Can't delete a branch point.
 	 */
-	if (ds->ds_phys->ds_num_children > 1)
+	if (dsl_dataset_phys(ds)->ds_num_children > 1)
 		return (SET_ERROR(EEXIST));
 
 	return (0);
@@ -147,12 +147,14 @@
 	struct process_old_arg *poa = arg;
 	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
 
-	if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
+	ASSERT(!BP_IS_HOLE(bp));
+
+	if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
 		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
 		if (poa->ds_prev && !poa->after_branch_point &&
 		    bp->blk_birth >
-		    poa->ds_prev->ds_phys->ds_prev_snap_txg) {
-			poa->ds_prev->ds_phys->ds_unique_bytes +=
+		    dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
+			dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
 			    bp_get_dsize_sync(dp->dp_spa, bp);
 		}
 	} else {
@@ -183,7 +185,7 @@
 	VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
 	    process_old_cb, &poa, tx));
 	VERIFY0(zio_wait(poa.pio));
-	ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
+	ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);
 
 	/* change snapused */
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
@@ -192,12 +194,14 @@
 	/* swap next's deadlist to our deadlist */
 	dsl_deadlist_close(&ds->ds_deadlist);
 	dsl_deadlist_close(&ds_next->ds_deadlist);
-	deadlist_obj = ds->ds_phys->ds_deadlist_obj;
-	ds->ds_phys->ds_deadlist_obj = ds_next->ds_phys->ds_deadlist_obj;
-	ds_next->ds_phys->ds_deadlist_obj = deadlist_obj;
-	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+	deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+	dsl_dataset_phys(ds)->ds_deadlist_obj =
+	    dsl_dataset_phys(ds_next)->ds_deadlist_obj;
+	dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
+	dsl_deadlist_open(&ds->ds_deadlist, mos,
+	    dsl_dataset_phys(ds)->ds_deadlist_obj);
 	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
-	    ds_next->ds_phys->ds_deadlist_obj);
+	    dsl_dataset_phys(ds_next)->ds_deadlist_obj);
 }
 
 static void
@@ -212,10 +216,10 @@
 	 * find the clones, but dsl_deadlist_remove_key() is a no-op so it
 	 * doesn't matter.
 	 */
-	if (ds->ds_dir->dd_phys->dd_clones == 0)
+	if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0)
 		return;
 
-	for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
+	for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		dsl_dataset_t *clone;
@@ -243,19 +247,22 @@
 	uint64_t obj;
 
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
-	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	ASSERT(refcount_is_zero(&ds->ds_longholds));
 
 	if (defer &&
-	    (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)) {
+	    (ds->ds_userrefs > 0 ||
+	    dsl_dataset_phys(ds)->ds_num_children > 1)) {
 		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
+		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
 		spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
 		return;
 	}
 
-	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 
 	/* We need to log before removing it from the namespace. */
 	spa_history_log_internal_ds(ds, "destroy", tx, "");
@@ -264,26 +271,34 @@
 
 	obj = ds->ds_object;
 
-	if (ds->ds_phys->ds_prev_snap_obj != 0) {
+	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+		if (ds->ds_feature_inuse[f]) {
+			dsl_dataset_deactivate_feature(obj, f, tx);
+			ds->ds_feature_inuse[f] = B_FALSE;
+		}
+	}
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		ASSERT3P(ds->ds_prev, ==, NULL);
 		VERIFY0(dsl_dataset_hold_obj(dp,
-		    ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
 		after_branch_point =
-		    (ds_prev->ds_phys->ds_next_snap_obj != obj);
+		    (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);
 
 		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
 		if (after_branch_point &&
-		    ds_prev->ds_phys->ds_next_clones_obj != 0) {
+		    dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
 			dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
-			if (ds->ds_phys->ds_next_snap_obj != 0) {
+			if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
 				VERIFY0(zap_add_int(mos,
-				    ds_prev->ds_phys->ds_next_clones_obj,
-				    ds->ds_phys->ds_next_snap_obj, tx));
+				    dsl_dataset_phys(ds_prev)->
+				    ds_next_clones_obj,
+				    dsl_dataset_phys(ds)->ds_next_snap_obj,
+				    tx));
 			}
 		}
 		if (!after_branch_point) {
-			ds_prev->ds_phys->ds_next_snap_obj =
-			    ds->ds_phys->ds_next_snap_obj;
+			dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
+			    dsl_dataset_phys(ds)->ds_next_snap_obj;
 		}
 	}
 
@@ -292,18 +307,18 @@
 	uint64_t used = 0, comp = 0, uncomp = 0;
 
 	VERIFY0(dsl_dataset_hold_obj(dp,
-	    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
-	ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
+	    dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
+	ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);
 
-	old_unique = ds_next->ds_phys->ds_unique_bytes;
+	old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;
 
 	dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
-	ds_next->ds_phys->ds_prev_snap_obj =
-	    ds->ds_phys->ds_prev_snap_obj;
-	ds_next->ds_phys->ds_prev_snap_txg =
-	    ds->ds_phys->ds_prev_snap_txg;
-	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
-	    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
+	dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj;
+	dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+	    ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);
 
 	if (ds_next->ds_deadlist.dl_oldfmt) {
 		process_old_deadlist(ds, ds_prev, ds_next,
@@ -312,15 +327,15 @@
 		/* Adjust prev's unique space. */
 		if (ds_prev && !after_branch_point) {
 			dsl_deadlist_space_range(&ds_next->ds_deadlist,
-			    ds_prev->ds_phys->ds_prev_snap_txg,
-			    ds->ds_phys->ds_prev_snap_txg,
+			    dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
+			    dsl_dataset_phys(ds)->ds_prev_snap_txg,
 			    &used, &comp, &uncomp);
-			ds_prev->ds_phys->ds_unique_bytes += used;
+			dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
 		}
 
 		/* Adjust snapused. */
 		dsl_deadlist_space_range(&ds_next->ds_deadlist,
-		    ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
 		    &used, &comp, &uncomp);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
 		    -used, -comp, -uncomp, tx);
@@ -327,7 +342,7 @@
 
 		/* Move blocks to be freed to pool's free list. */
 		dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
-		    &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
+		    &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
 		    tx);
 		dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
 		    DD_USED_HEAD, used, comp, uncomp, tx);
@@ -334,18 +349,18 @@
 
 		/* Merge our deadlist into next's and free it. */
 		dsl_deadlist_merge(&ds_next->ds_deadlist,
-		    ds->ds_phys->ds_deadlist_obj, tx);
+		    dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
 	}
 	dsl_deadlist_close(&ds->ds_deadlist);
-	dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
+	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ds->ds_phys->ds_deadlist_obj = 0;
+	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
 
 	/* Collapse range in clone heads */
 	dsl_dataset_remove_clones_key(ds,
-	    ds->ds_phys->ds_creation_txg, tx);
+	    dsl_dataset_phys(ds)->ds_creation_txg, tx);
 
-	if (dsl_dataset_is_snapshot(ds_next)) {
+	if (ds_next->ds_is_snapshot) {
 		dsl_dataset_t *ds_nextnext;
 
 		/*
@@ -358,12 +373,13 @@
 		 * deadlist).
 		 */
 		VERIFY0(dsl_dataset_hold_obj(dp,
-		    ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_nextnext));
+		    dsl_dataset_phys(ds_next)->ds_next_snap_obj,
+		    FTAG, &ds_nextnext));
 		dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
-		    ds->ds_phys->ds_prev_snap_txg,
-		    ds->ds_phys->ds_creation_txg,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
+		    dsl_dataset_phys(ds)->ds_creation_txg,
 		    &used, &comp, &uncomp);
-		ds_next->ds_phys->ds_unique_bytes += used;
+		dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
 		dsl_dataset_rele(ds_nextnext, FTAG);
 		ASSERT3P(ds_next->ds_prev, ==, NULL);
 
@@ -370,9 +386,9 @@
 		/* Collapse range in this head. */
 		dsl_dataset_t *hds;
 		VERIFY0(dsl_dataset_hold_obj(dp,
-		    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds));
+		    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds));
 		dsl_deadlist_remove_key(&hds->ds_deadlist,
-		    ds->ds_phys->ds_creation_txg, tx);
+		    dsl_dataset_phys(ds)->ds_creation_txg, tx);
 		dsl_dataset_rele(hds, FTAG);
 
 	} else {
@@ -381,7 +397,7 @@
 		ds_next->ds_prev = NULL;
 		if (ds_prev) {
 			VERIFY0(dsl_dataset_hold_obj(dp,
-			    ds->ds_phys->ds_prev_snap_obj,
+			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
 			    ds_next, &ds_next->ds_prev));
 		}
 
@@ -395,7 +411,7 @@
 		if (old_unique < ds_next->ds_reserved) {
 			int64_t mrsdelta;
 			uint64_t new_unique =
-			    ds_next->ds_phys->ds_unique_bytes;
+			    dsl_dataset_phys(ds_next)->ds_unique_bytes;
 
 			ASSERT(old_unique <= new_unique);
 			mrsdelta = MIN(new_unique - old_unique,
@@ -417,9 +433,9 @@
 
 	/* remove from snapshot namespace */
 	dsl_dataset_t *ds_head;
-	ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
+	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
 	VERIFY0(dsl_dataset_hold_obj(dp,
-	    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
+	    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
 	VERIFY0(dsl_dataset_get_snapname(ds));
 #ifdef ZFS_DEBUG
 	{
@@ -431,7 +447,7 @@
 		ASSERT3U(val, ==, obj);
 	}
 #endif
-	VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx));
+	VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
 	dsl_dataset_rele(ds_head, FTAG);
 
 	if (ds_prev != NULL)
@@ -439,20 +455,23 @@
 
 	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
 
-	if (ds->ds_phys->ds_next_clones_obj != 0) {
+	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
 		uint64_t count;
 		ASSERT0(zap_count(mos,
-		    ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
+		    dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
+		    count == 0);
 		VERIFY0(dmu_object_free(mos,
-		    ds->ds_phys->ds_next_clones_obj, tx));
+		    dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
 	}
-	if (ds->ds_phys->ds_props_obj != 0)
-		VERIFY0(zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
-	if (ds->ds_phys->ds_userrefs_obj != 0)
-		VERIFY0(zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
+	if (dsl_dataset_phys(ds)->ds_props_obj != 0)
+		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
+		    tx));
+	if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
+		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+		    tx));
 	dsl_dir_rele(ds->ds_dir, ds);
 	ds->ds_dir = NULL;
-	VERIFY0(dmu_object_free(mos, obj, tx));
+	dmu_object_free_zapified(mos, obj, tx);
 }
 
 static void
@@ -506,7 +525,7 @@
 
 	error = dsl_sync_task(nvpair_name(pair),
 	    dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync,
-	    &dsda, 0);
+	    &dsda, 0, ZFS_SPACE_CHECK_NONE);
 	fnvlist_free(dsda.dsda_successful_snaps);
 
 	return (error);
@@ -534,12 +553,12 @@
 /* ARGSUSED */
 static int
 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct killarg *ka = arg;
 	dmu_tx_t *tx = ka->tx;
 
-	if (bp == NULL)
+	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 
 	if (zb->zb_level == ZB_ZIL_LEVEL) {
@@ -551,7 +570,8 @@
 		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
 	} else {
 		ASSERT(zilog == NULL);
-		ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
+		ASSERT3U(bp->blk_birth, >,
+		    dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
 		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
 	}
 
@@ -573,9 +593,10 @@
 	ka.ds = ds;
 	ka.tx = tx;
 	VERIFY0(traverse_dataset(ds,
-	    ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
 	    kill_blkptr, &ka));
-	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
+	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+	    dsl_dataset_phys(ds)->ds_unique_bytes == 0);
 }
 
 typedef struct dsl_destroy_head_arg {
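
kill_blkptr() above follows the standard traverse_dataset() callback
shape, now taking a zbookmark_phys_t and skipping holes and embedded
block pointers explicitly. A minimal callback of the same shape, purely
illustrative (it only counts block pointers):

	/* ARGSUSED */
	static int
	count_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
	    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
	{
		uint64_t *countp = arg;

		/* Holes and embedded BPs reference no allocated space. */
		if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
			return (0);
		(*countp)++;
		return (0);
	}

invoked as traverse_dataset(ds, from_txg, TRAVERSE_POST, count_blkptr,
&count).
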
@@ -589,7 +610,8 @@
 	uint64_t count;
 	objset_t *mos;
 
-	if (dsl_dataset_is_snapshot(ds))
+	ASSERT(!ds->ds_is_snapshot);
+	if (ds->ds_is_snapshot)
 		return (SET_ERROR(EINVAL));
 
 	if (refcount_count(&ds->ds_longholds) != expected_holds)
@@ -603,7 +625,7 @@
 	 * from.)
 	 */
 	if (ds->ds_prev != NULL &&
-	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
+	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
 		return (SET_ERROR(EBUSY));
 
 	/*
@@ -610,7 +632,7 @@
 	 * Can't delete if there are children of this fs.
 	 */
 	error = zap_count(mos,
-	    ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
+	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count);
 	if (error != 0)
 		return (error);
 	if (count != 0)
@@ -617,7 +639,7 @@
 		return (SET_ERROR(EEXIST));
 
 	if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
-	    ds->ds_prev->ds_phys->ds_num_children == 2 &&
+	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
 	    ds->ds_prev->ds_userrefs == 0) {
 		/* We need to remove the origin snapshot as well. */
 		if (!refcount_is_zero(&ds->ds_prev->ds_longholds))
@@ -655,27 +677,39 @@
 
 	VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
 
-	ASSERT0(dd->dd_phys->dd_head_dataset_obj);
+	ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);
 
 	/*
+	 * Decrement the filesystem count for all parent filesystems.
+	 *
+	 * When we receive an incremental stream into a filesystem that already
+	 * exists, a temporary clone is created.  We never count this temporary
+	 * clone, whose name begins with a '%'.
+	 */
+	if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
+		dsl_fs_ss_count_adjust(dd->dd_parent, -1,
+		    DD_FIELD_FILESYSTEM_COUNT, tx);
+
+	/*
 	 * Remove our reservation. The impl() routine avoids setting the
 	 * actual property, which would require the (already destroyed) ds.
 	 */
 	dsl_dir_set_reservation_sync_impl(dd, 0, tx);
 
-	ASSERT0(dd->dd_phys->dd_used_bytes);
-	ASSERT0(dd->dd_phys->dd_reserved);
+	ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
+	ASSERT0(dsl_dir_phys(dd)->dd_reserved);
 	for (t = 0; t < DD_USED_NUM; t++)
-		ASSERT0(dd->dd_phys->dd_used_breakdown[t]);
+		ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
 
-	VERIFY0(zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
-	VERIFY0(zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
-	VERIFY0(dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
+	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
+	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
+	VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
 	VERIFY0(zap_remove(mos,
-	    dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
+	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
+	    dd->dd_myname, tx));
 
 	dsl_dir_rele(dd, FTAG);
-	VERIFY0(dmu_object_free(mos, ddobj, tx));
+	dmu_object_free_zapified(mos, ddobj, tx);
 }
 
 void
@@ -686,10 +720,12 @@
 	uint64_t obj, ddobj, prevobj = 0;
 	boolean_t rmorigin;
 
-	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 	ASSERT(ds->ds_prev == NULL ||
-	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
-	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	/* We need to log before removing it from the namespace. */
@@ -697,10 +733,10 @@
 
 	rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
 	    DS_IS_DEFER_DESTROY(ds->ds_prev) &&
-	    ds->ds_prev->ds_phys->ds_num_children == 2 &&
+	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
 	    ds->ds_prev->ds_userrefs == 0);
 
-	/* Remove our reservation */
+	/* Remove our reservation. */
 	if (ds->ds_reserved != 0) {
 		dsl_dataset_set_refreservation_sync_impl(ds,
 		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
@@ -708,30 +744,34 @@
 		ASSERT0(ds->ds_reserved);
 	}
 
+	obj = ds->ds_object;
+
+	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+		if (ds->ds_feature_inuse[f]) {
+			dsl_dataset_deactivate_feature(obj, f, tx);
+			ds->ds_feature_inuse[f] = B_FALSE;
+		}
+	}
+
 	dsl_scan_ds_destroyed(ds, tx);
 
-	obj = ds->ds_object;
-
-	if (ds->ds_phys->ds_prev_snap_obj != 0) {
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		/* This is a clone */
 		ASSERT(ds->ds_prev != NULL);
-		ASSERT3U(ds->ds_prev->ds_phys->ds_next_snap_obj, !=, obj);
-		ASSERT0(ds->ds_phys->ds_next_snap_obj);
+		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
+		    obj);
+		ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);
 
 		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-		if (ds->ds_prev->ds_phys->ds_next_clones_obj != 0) {
+		if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
 			dsl_dataset_remove_from_next_clones(ds->ds_prev,
 			    obj, tx);
 		}
 
-		ASSERT3U(ds->ds_prev->ds_phys->ds_num_children, >, 1);
-		ds->ds_prev->ds_phys->ds_num_children--;
+		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
+		dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
 	}
 
-	zfeature_info_t *async_destroy =
-	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
-	objset_t *os;
-
 	/*
 	 * Destroy the deadlist.  Unless it's a clone, the
 	 * deadlist should be empty.  (If it's a clone, it's
@@ -738,13 +778,14 @@
 	 * safe to ignore the deadlist contents.)
 	 */
 	dsl_deadlist_close(&ds->ds_deadlist);
-	dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
+	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ds->ds_phys->ds_deadlist_obj = 0;
+	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
 
+	objset_t *os;
 	VERIFY0(dmu_objset_from_ds(ds, &os));
 
-	if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
+	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		old_synchronous_dataset_destroy(ds, tx);
 	} else {
 		/*
@@ -755,10 +796,11 @@
 
 		zil_destroy_sync(dmu_objset_zil(os), tx);
 
-		if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
+		if (!spa_feature_is_active(dp->dp_spa,
+		    SPA_FEATURE_ASYNC_DESTROY)) {
 			dsl_scan_t *scn = dp->dp_scan;
-
-			spa_feature_incr(dp->dp_spa, async_destroy, tx);
+			spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
+			    tx);
 			dp->dp_bptree_obj = bptree_alloc(mos, tx);
 			VERIFY0(zap_add(mos,
 			    DMU_POOL_DIRECTORY_OBJECT,
@@ -768,16 +810,19 @@
 			scn->scn_async_destroying = B_TRUE;
 		}
 
-		used = ds->ds_dir->dd_phys->dd_used_bytes;
-		comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
-		uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
+		used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
+		comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
+		uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
 
 		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
-		    ds->ds_phys->ds_unique_bytes == used);
+		    dsl_dataset_phys(ds)->ds_unique_bytes == used);
 
+		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 		bptree_add(mos, dp->dp_bptree_obj,
-		    &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
+		    &dsl_dataset_phys(ds)->ds_bp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
 		    used, comp, uncomp, tx);
+		rrw_exit(&ds->ds_bp_rwlock, FTAG);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
 		    -used, -comp, -uncomp, tx);
 		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
@@ -787,7 +832,7 @@
 	if (ds->ds_prev != NULL) {
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 			VERIFY0(zap_remove_int(mos,
-			    ds->ds_prev->ds_dir->dd_phys->dd_clones,
+			    dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
 			    ds->ds_object, tx));
 		}
 		prevobj = ds->ds_prev->ds_object;
@@ -806,19 +851,25 @@
 
 	/* Erase the link in the dir */
 	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
-	ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
+	dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
 	ddobj = ds->ds_dir->dd_object;
-	ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
-	VERIFY0(zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx));
+	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
+	VERIFY0(zap_destroy(mos,
+	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));
 
+	if (ds->ds_bookmarks != 0) {
+		VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
+		spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
+	}
+
 	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
 
-	ASSERT0(ds->ds_phys->ds_next_clones_obj);
-	ASSERT0(ds->ds_phys->ds_props_obj);
-	ASSERT0(ds->ds_phys->ds_userrefs_obj);
+	ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
+	ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
+	ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
 	dsl_dir_rele(ds->ds_dir, ds);
 	ds->ds_dir = NULL;
-	VERIFY0(dmu_object_free(mos, obj, tx));
+	dmu_object_free_zapified(mos, obj, tx);
 
 	dsl_dir_destroy_sync(ddobj, tx);
 
@@ -853,7 +904,7 @@
 
 	/* Mark it as inconsistent on-disk, in case we crash */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+	dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
 
 	spa_history_log_internal_ds(ds, "destroy begin", tx, "");
 	dsl_dataset_rele(ds, FTAG);
@@ -874,8 +925,7 @@
 	error = spa_open(name, &spa, FTAG);
 	if (error != 0)
 		return (error);
-	isenabled = spa_feature_is_enabled(spa,
-	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);
+	isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);
 	spa_close(spa, FTAG);
 
 	ddha.ddha_name = name;
@@ -884,7 +934,8 @@
 		objset_t *os;
 
 		error = dsl_sync_task(name, dsl_destroy_head_check,
-		    dsl_destroy_head_begin_sync, &ddha, 0);
+		    dsl_destroy_head_begin_sync, &ddha,
+		    0, ZFS_SPACE_CHECK_NONE);
 		if (error != 0)
 			return (error);
 
@@ -896,11 +947,12 @@
 		error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
 		if (error == 0) {
 			uint64_t prev_snap_txg =
-			    dmu_objset_ds(os)->ds_phys->ds_prev_snap_txg;
+			    dsl_dataset_phys(dmu_objset_ds(os))->
+			    ds_prev_snap_txg;
 			for (uint64_t obj = 0; error == 0;
 			    error = dmu_object_next(os, &obj, FALSE,
 			    prev_snap_txg))
-				(void) dmu_free_object(os, obj);
+				(void) dmu_free_long_object(os, obj);
 			/* sync out all frees */
 			txg_wait_synced(dmu_objset_pool(os), 0);
 			dmu_objset_disown(os, FTAG);
@@ -908,7 +960,7 @@
 	}
 
 	return (dsl_sync_task(name, dsl_destroy_head_check,
-	    dsl_destroy_head_sync, &ddha, 0));
+	    dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 /*
@@ -924,9 +976,17 @@
 	objset_t *os;
 
 	if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
-		boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+		boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+
+		/*
+		 * If the dataset is inconsistent because a resumable receive
+		 * has failed, then do not destroy it.
+		 */
+		if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
+			need_destroy = B_FALSE;
+
 		dmu_objset_rele(os, FTAG);
-		if (inconsistent)
+		if (need_destroy)
 			(void) dsl_destroy_head(dsname);
 	}
 	return (0);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -23,7 +23,10 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel at dawidek.net>.
  * All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -34,6 +37,7 @@
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
+#include <sys/dmu_impl.h>
 #include <sys/spa.h>
 #include <sys/metaslab.h>
 #include <sys/zap.h>
@@ -44,18 +48,100 @@
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
+#include <sys/zfeature.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
 #include "zfs_namecheck.h"
+#include "zfs_prop.h"
 
+/*
+ * Filesystem and Snapshot Limits
+ * ------------------------------
+ *
+ * These limits are used to restrict the number of filesystems and/or snapshots
+ * that can be created at a given level in the tree or below. A typical
+ * use-case is with a delegated dataset where the administrator wants to ensure
+ * that a user within the zone is not creating too many additional filesystems
+ * or snapshots, even though they're not exceeding their space quota.
+ *
+ * The filesystem and snapshot counts are stored as extensible properties. This
+ * capability is controlled by a feature flag and must be enabled to be used.
+ * Once enabled, the feature is not active until the first limit is set. At
+ * that point, future operations to create/destroy filesystems or snapshots
+ * will validate and update the counts.
+ *
+ * Because the count properties will not exist before the feature is active,
+ * the counts are updated when a limit is first set on an uninitialized
+ * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
+ * all of the nested filesystems/snapshots. Thus, a new leaf node has a
+ * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
+ * snapshot count properties on a node indicate uninitialized counts on that
+ * node.) When first setting a limit on an uninitialized node, the code starts
+ * at the filesystem with the new limit and descends into all sub-filesystems
+ * to add the count properties.
+ *
+ * In practice this is lightweight since a limit is typically set when the
+ * filesystem is created and thus has no children. Once valid, changing the
+ * limit value won't require a re-traversal since the counts are already valid.
+ * When recursively fixing the counts, if a node with a limit is encountered
+ * during the descent, the counts are known to be valid and there is no need to
+ * descend into that filesystem's children. The counts on filesystems above the
+ * one with the new limit will still be uninitialized, unless a limit is
+ * eventually set on one of those filesystems. The counts are always recursively
+ * updated when a limit is set on a dataset, unless there is already a limit.
+ * When a new limit value is set on a filesystem with an existing limit, it is
+ * possible for the new limit to be less than the current count at that level
+ * since a user who can change the limit is also allowed to exceed the limit.
+ *
+ * Once the feature is active, then whenever a filesystem or snapshot is
+ * created, the code recurses up the tree, validating the new count against the
+ * limit at each initialized level. In practice, most levels will not have a
+ * limit set. If there is a limit at any initialized level up the tree, the
+ * check must pass or the creation will fail. Likewise, when a filesystem or
+ * snapshot is destroyed, the counts are recursively adjusted all the way up
+ * the initialized nodes in the tree. Renaming a filesystem into a different
+ * point in the tree will first validate, then update the counts on each
+ * branch up to the common ancestor. A receive will also validate the counts
+ * and then update them.
+ *
+ * An exception to the above behavior is that the limit is not enforced if the
+ * user has permission to modify the limit. This is primarily so that
+ * recursive snapshots in the global zone always work. We want to prevent a
+ * denial-of-service in which a lower level delegated dataset could max out its
+ * limit and thus block recursive snapshots from being taken in the global zone.
+ * Because of this, it is possible for the snapshot count to be over the limit
+ * and snapshots taken in the global zone could cause a lower level dataset to
+ * hit or exceed its limit. The administrator taking the global zone recursive
+ * snapshot should be aware of this side-effect and behave accordingly.
+ * For consistency, the filesystem limit is also not enforced if the user can
+ * modify the limit.
+ *
+ * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
+ * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
+ * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
+ * dsl_dir_init_fs_ss_count().
+ *
+ * There is a special case when we receive a filesystem that already exists. In
+ * this case a temporary clone name of %X is created (see dmu_recv_begin). We
+ * never update the filesystem counts for temporary clones.
+ *
+ * Likewise, we do not update the snapshot counts for temporary snapshots,
+ * such as those created by zfs diff.
+ */
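
As a minimal illustration of the roll-up rule described above, here is
a toy model (plain C, not ZFS code): a count adjustment walks up the
tree and stops at the first uninitialized ancestor, because no limit
can have been set at or above a node whose counts were never
initialized.

	#include <stdio.h>
	#include <stdint.h>

	/* Toy stand-in for a dsl_dir; 'initialized' models a zapified
	 * dir that carries the count properties. */
	struct toy_dir {
		struct toy_dir *parent;
		int initialized;
		int64_t count;
	};

	/* Models dsl_fs_ss_count_adjust(): stop at the first
	 * uninitialized ancestor. */
	static void
	toy_count_adjust(struct toy_dir *d, int64_t delta)
	{
		for (; d != NULL && d->initialized; d = d->parent)
			d->count += delta;
	}

	int
	main(void)
	{
		struct toy_dir root = { NULL, 0, 0 };	/* no limit set */
		struct toy_dir mid = { &root, 1, 5 };	/* limit was set */
		struct toy_dir leaf = { &mid, 1, 2 };

		toy_count_adjust(&leaf, 1);	/* one new filesystem */
		printf("leaf=%lld mid=%lld root=%lld\n",
		    (long long)leaf.count, (long long)mid.count,
		    (long long)root.count);	/* leaf=3 mid=6 root=0 */
		return (0);
	}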
+
+extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
+
 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
 
-/* ARGSUSED */
 static void
-dsl_dir_evict(dmu_buf_t *db, void *arg)
+dsl_dir_evict(void *dbu)
 {
-	dsl_dir_t *dd = arg;
+	dsl_dir_t *dd = dbu;
 	dsl_pool_t *dp = dd->dd_pool;
 	int t;
 
+	dd->dd_dbuf = NULL;
+
 	for (t = 0; t < TXG_SIZE; t++) {
 		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
 		ASSERT(dd->dd_tempreserved[t] == 0);
@@ -63,15 +149,11 @@
 	}
 
 	if (dd->dd_parent)
-		dsl_dir_rele(dd->dd_parent, dd);
+		dsl_dir_async_rele(dd->dd_parent, dd);
 
-	spa_close(dd->dd_pool->dp_spa, dd);
+	spa_async_close(dd->dd_pool->dp_spa, dd);
 
-	/*
-	 * The props callback list should have been cleaned up by
-	 * objset_evict().
-	 */
-	list_destroy(&dd->dd_prop_cbs);
+	dsl_prop_fini(dd);
 	mutex_destroy(&dd->dd_lock);
 	kmem_free(dd, sizeof (dsl_dir_t));
 }
@@ -94,7 +176,7 @@
 	{
 		dmu_object_info_t doi;
 		dmu_object_info_from_db(dbuf, &doi);
-		ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR);
+		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
 		ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
 	}
 #endif
@@ -105,17 +187,15 @@
 		dd->dd_object = ddobj;
 		dd->dd_dbuf = dbuf;
 		dd->dd_pool = dp;
-		dd->dd_phys = dbuf->db_data;
 		mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
+		dsl_prop_init(dd);
 
-		list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
-		    offsetof(dsl_prop_cb_record_t, cbr_node));
-
 		dsl_dir_snap_cmtime_update(dd);
 
-		if (dd->dd_phys->dd_parent_obj) {
-			err = dsl_dir_hold_obj(dp, dd->dd_phys->dd_parent_obj,
-			    NULL, dd, &dd->dd_parent);
+		if (dsl_dir_phys(dd)->dd_parent_obj) {
+			err = dsl_dir_hold_obj(dp,
+			    dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
+			    &dd->dd_parent);
 			if (err != 0)
 				goto errout;
 			if (tail) {
@@ -123,14 +203,16 @@
 				uint64_t foundobj;
 
 				err = zap_lookup(dp->dp_meta_objset,
-				    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
-				    tail, sizeof (foundobj), 1, &foundobj);
+				    dsl_dir_phys(dd->dd_parent)->
+				    dd_child_dir_zapobj, tail,
+				    sizeof (foundobj), 1, &foundobj);
 				ASSERT(err || foundobj == ddobj);
 #endif
 				(void) strcpy(dd->dd_myname, tail);
 			} else {
 				err = zap_value_search(dp->dp_meta_objset,
-				    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+				    dsl_dir_phys(dd->dd_parent)->
+				    dd_child_dir_zapobj,
 				    ddobj, 0, dd->dd_myname);
 			}
 			if (err != 0)
@@ -149,7 +231,8 @@
 			 * Just look at its phys directly instead.
 			 */
 			err = dmu_bonus_hold(dp->dp_meta_objset,
-			    dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
+			    dsl_dir_phys(dd)->dd_origin_obj, FTAG,
+			    &origin_bonus);
 			if (err != 0)
 				goto errout;
 			origin_phys = origin_bonus->db_data;
@@ -158,11 +241,12 @@
 			dmu_buf_rele(origin_bonus, FTAG);
 		}
 
-		winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
-		    dsl_dir_evict);
-		if (winner) {
+		dmu_buf_init_user(&dd->dd_dbu, dsl_dir_evict, &dd->dd_dbuf);
+		winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
+		if (winner != NULL) {
 			if (dd->dd_parent)
 				dsl_dir_rele(dd->dd_parent, dd);
+			dsl_prop_fini(dd);
 			mutex_destroy(&dd->dd_lock);
 			kmem_free(dd, sizeof (dsl_dir_t));
 			dd = winner;
@@ -190,6 +274,7 @@
 errout:
 	if (dd->dd_parent)
 		dsl_dir_rele(dd->dd_parent, dd);
+	dsl_prop_fini(dd);
 	mutex_destroy(&dd->dd_lock);
 	kmem_free(dd, sizeof (dsl_dir_t));
 	dmu_buf_rele(dbuf, tag);
@@ -204,13 +289,29 @@
 	dmu_buf_rele(dd->dd_dbuf, tag);
 }
 
-/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
+/*
+ * Remove a reference to the given dsl dir that is being asynchronously
+ * released.  Async releases occur from a taskq performing eviction of
+ * dsl datasets and dirs.  This process is identical to a normal release
+ * with the exception of using the async API for releasing the reference on
+ * the spa.
+ */
 void
+dsl_dir_async_rele(dsl_dir_t *dd, void *tag)
+{
+	dprintf_dd(dd, "%s\n", "");
+	spa_async_close(dd->dd_pool->dp_spa, tag);
+	dmu_buf_rele(dd->dd_dbuf, tag);
+}
+
+/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */
+void
 dsl_dir_name(dsl_dir_t *dd, char *buf)
 {
 	if (dd->dd_parent) {
 		dsl_dir_name(dd->dd_parent, buf);
-		(void) strcat(buf, "/");
+		VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
+		    ZFS_MAX_DATASET_NAME_LEN);
 	} else {
 		buf[0] = '\0';
 	}
@@ -220,10 +321,12 @@
 		 * dprintf_dd() with dd_lock held
 		 */
 		mutex_enter(&dd->dd_lock);
-		(void) strcat(buf, dd->dd_myname);
+		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
+		    <, ZFS_MAX_DATASET_NAME_LEN);
 		mutex_exit(&dd->dd_lock);
 	} else {
-		(void) strcat(buf, dd->dd_myname);
+		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
+		    <, ZFS_MAX_DATASET_NAME_LEN);
 	}
 }
 
@@ -272,12 +375,12 @@
 		if (p != NULL &&
 		    (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
 			return (SET_ERROR(EINVAL));
-		if (strlen(path) >= MAXNAMELEN)
+		if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
 			return (SET_ERROR(ENAMETOOLONG));
 		(void) strcpy(component, path);
 		p = NULL;
 	} else if (p[0] == '/') {
-		if (p - path >= MAXNAMELEN)
+		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
 			return (SET_ERROR(ENAMETOOLONG));
 		(void) strncpy(component, path, p - path);
 		component[p - path] = '\0';
@@ -289,7 +392,7 @@
 		 */
 		if (strchr(path, '/'))
 			return (SET_ERROR(EINVAL));
-		if (p - path >= MAXNAMELEN)
+		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
 			return (SET_ERROR(ENAMETOOLONG));
 		(void) strncpy(component, path, p - path);
 		component[p - path] = '\0';
@@ -311,7 +414,7 @@
 dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
     dsl_dir_t **ddp, const char **tailp)
 {
-	char buf[MAXNAMELEN];
+	char buf[ZFS_MAX_DATASET_NAME_LEN];
 	const char *spaname, *next, *nextnext = NULL;
 	int err;
 	dsl_dir_t *dd;
@@ -324,7 +427,7 @@
 	/* Make sure the name is in the specified pool. */
 	spaname = spa_name(dp->dp_spa);
 	if (strcmp(buf, spaname) != 0)
-		return (SET_ERROR(EINVAL));
+		return (SET_ERROR(EXDEV));
 
 	ASSERT(dsl_pool_config_held(dp));
 
@@ -334,7 +437,7 @@
 	}
 
 	while (next != NULL) {
-		dsl_dir_t *child_ds;
+		dsl_dir_t *child_dd;
 		err = getcomponent(next, buf, &nextnext);
 		if (err != 0)
 			break;
@@ -342,10 +445,10 @@
 		if (next[0] == '@')
 			break;
 		dprintf("looking up %s in obj%lld\n",
-		    buf, dd->dd_phys->dd_child_dir_zapobj);
+		    buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);
 
 		err = zap_lookup(dp->dp_meta_objset,
-		    dd->dd_phys->dd_child_dir_zapobj,
+		    dsl_dir_phys(dd)->dd_child_dir_zapobj,
 		    buf, sizeof (ddobj), 1, &ddobj);
 		if (err != 0) {
 			if (err == ENOENT)
@@ -353,11 +456,11 @@
 			break;
 		}
 
-		err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds);
+		err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
 		if (err != 0)
 			break;
 		dsl_dir_rele(dd, tag);
-		dd = child_ds;
+		dd = child_dd;
 		next = nextnext;
 	}
 
@@ -383,6 +486,404 @@
 	return (err);
 }
 
+/*
+ * If the counts are already initialized for this filesystem and its
+ * descendants then do nothing, otherwise initialize the counts.
+ *
+ * The counts on this filesystem, and those below, may be uninitialized due to
+ * either the use of a pre-existing pool which did not support the
+ * filesystem/snapshot limit feature, or one in which the feature had not yet
+ * been enabled.
+ *
+ * Recursively descend the filesystem tree and update the filesystem/snapshot
+ * counts on each filesystem below, then update the cumulative count on the
+ * current filesystem. If the filesystem already has a count set on it,
+ * then we know that its counts, and the counts on the filesystems below it,
+ * are already correct, so we don't have to update this filesystem.
+ */
+static void
+dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+	uint64_t my_fs_cnt = 0;
+	uint64_t my_ss_cnt = 0;
+	dsl_pool_t *dp = dd->dd_pool;
+	objset_t *os = dp->dp_meta_objset;
+	zap_cursor_t *zc;
+	zap_attribute_t *za;
+	dsl_dataset_t *ds;
+
+	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
+	ASSERT(dsl_pool_config_held(dp));
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dsl_dir_zapify(dd, tx);
+
+	/*
+	 * If the filesystem count has already been initialized then we
+	 * don't need to recurse down any further.
+	 */
+	if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
+		return;
+
+	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+	/* Iterate my child dirs */
+	for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
+	    zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
+		dsl_dir_t *chld_dd;
+		uint64_t count;
+
+		VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
+		    &chld_dd));
+
+		/*
+		 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
+		 * temporary datasets.
+		 */
+		if (chld_dd->dd_myname[0] == '$' ||
+		    chld_dd->dd_myname[0] == '%') {
+			dsl_dir_rele(chld_dd, FTAG);
+			continue;
+		}
+
+		my_fs_cnt++;	/* count this child */
+
+		dsl_dir_init_fs_ss_count(chld_dd, tx);
+
+		VERIFY0(zap_lookup(os, chld_dd->dd_object,
+		    DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
+		my_fs_cnt += count;
+		VERIFY0(zap_lookup(os, chld_dd->dd_object,
+		    DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
+		my_ss_cnt += count;
+
+		dsl_dir_rele(chld_dd, FTAG);
+	}
+	zap_cursor_fini(zc);
+	/* Count my snapshots (we counted children's snapshots above) */
+	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
+	    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));
+
+	for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
+	    zap_cursor_retrieve(zc, za) == 0;
+	    zap_cursor_advance(zc)) {
+		/* Don't count temporary snapshots */
+		if (za->za_name[0] != '%')
+			my_ss_cnt++;
+	}
+	zap_cursor_fini(zc);
+
+	dsl_dataset_rele(ds, FTAG);
+
+	kmem_free(zc, sizeof (zap_cursor_t));
+	kmem_free(za, sizeof (zap_attribute_t));
+
+	/* we're in a sync task, update counts */
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+	    sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
+	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+	    sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
+}
+
+static int
+dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
+{
+	char *ddname = (char *)arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	dsl_dir_t *dd;
+	int error;
+
+	error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	dd = ds->ds_dir;
+	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
+	    dsl_dir_is_zapified(dd) &&
+	    zap_contains(dp->dp_meta_objset, dd->dd_object,
+	    DD_FIELD_FILESYSTEM_COUNT) == 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EALREADY));
+	}
+
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+static void
+dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
+{
+	char *ddname = (char *)arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	spa_t *spa;
+
+	VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
+
+	spa = dsl_dataset_get_spa(ds);
+
+	if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
+		/*
+		 * Since the feature was not active and we're now setting a
+		 * limit, increment the feature-active counter so that the
+		 * feature becomes active for the first time.
+		 *
+		 * We are already in a sync task so we can update the MOS.
+		 */
+		spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
+	}
+
+	/*
+	 * Since we are now setting a non-UINT64_MAX limit on the filesystem,
+	 * we need to ensure the counts are correct. Descend down the tree from
+	 * this point and update all of the counts to be accurate.
+	 */
+	dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
+
+	dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Make sure the feature is enabled and activate it if necessary.
+ * Since we're setting a limit, ensure the on-disk counts are valid.
+ * This is only called by the ioctl path when setting a limit value.
+ *
+ * We do not need to validate the new limit, since users who can change the
+ * limit are also allowed to exceed the limit.
+ */
+int
+dsl_dir_activate_fs_ss_limit(const char *ddname)
+{
+	int error;
+
+	error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
+	    dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
+	    ZFS_SPACE_CHECK_RESERVED);
+
+	if (error == EALREADY)
+		error = 0;
+
+	return (error);
+}
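
The EALREADY-to-0 mapping above makes activation idempotent: setting a
limit in a subtree whose counts were already initialized is not an
error. A hedged sketch of how the property-set path is expected to use
this function (the real caller lives in the ioctl code; the helper
name below is hypothetical):

	/* Hypothetical caller; kernel context, not standalone. */
	static int
	set_fs_limit_prop(const char *dsname, uint64_t limit)
	{
		int err;

		/* Activate SPA_FEATURE_FS_SS_LIMIT and initialize the
		 * on-disk counts for this subtree, if needed. */
		err = dsl_dir_activate_fs_ss_limit(dsname);
		if (err != 0)
			return (err);

		/* The limit value itself is then stored through the
		 * normal dsl property path (elided here). */
		return (0);
	}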
+
+/*
+ * Used to determine if the filesystem_limit or snapshot_limit should be
+ * enforced. We allow the limit to be exceeded if the user has permission to
+ * write the property value. We pass in the creds that we got in the open
+ * context since we will always be the GZ root in syncing context. We also have
+ * to handle the case where we are allowed to change the limit on the current
+ * dataset, but there may be another limit in the tree above.
+ *
+ * We can never modify these two properties within a non-global zone. In
+ * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
+ * can't use that function since we are already holding the dp_config_rwlock.
+ * In addition, we already have the dd and dealing with snapshots is simplified
+ * in this code.
+ */
+
+typedef enum {
+	ENFORCE_ALWAYS,
+	ENFORCE_NEVER,
+	ENFORCE_ABOVE
+} enforce_res_t;
+
+static enforce_res_t
+dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
+{
+	enforce_res_t enforce = ENFORCE_ALWAYS;
+	uint64_t obj;
+	dsl_dataset_t *ds;
+	uint64_t zoned;
+
+	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+#ifdef _KERNEL
+#ifdef __FreeBSD__
+	if (jailed(cr))
+#else
+	if (crgetzoneid(cr) != GLOBAL_ZONEID)
+#endif
+		return (ENFORCE_ALWAYS);
+
+	if (secpolicy_zfs(cr) == 0)
+		return (ENFORCE_NEVER);
+#endif
+
+	if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
+		return (ENFORCE_ALWAYS);
+
+	ASSERT(dsl_pool_config_held(dd->dd_pool));
+
+	if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
+		return (ENFORCE_ALWAYS);
+
+	if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
+		/* Only root can access zoned fs's from the GZ */
+		enforce = ENFORCE_ALWAYS;
+	} else {
+		if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
+			enforce = ENFORCE_ABOVE;
+	}
+
+	dsl_dataset_rele(ds, FTAG);
+	return (enforce);
+}
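
Condensed, the checks above evaluate as follows (paraphrase; the
function body is authoritative):

	/*
	 * dsl_enforce_ds_ss_limits() outcomes, in evaluation order:
	 *   jailed / non-global-zone caller       -> ENFORCE_ALWAYS
	 *   caller passes secpolicy_zfs()         -> ENFORCE_NEVER
	 *   no head dataset, or hold fails        -> ENFORCE_ALWAYS
	 *   dataset is zoned                      -> ENFORCE_ALWAYS
	 *   caller may write the limit property   -> ENFORCE_ABOVE
	 *   otherwise                             -> ENFORCE_ALWAYS
	 */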
+
+/*
+ * Check if adding additional child filesystem(s) would exceed any filesystem
+ * limits or adding additional snapshot(s) would exceed any snapshot limits.
+ * The prop argument indicates which limit to check.
+ *
+ * Note that all filesystem limits up to the root (or the highest
+ * initialized) filesystem or the given ancestor must be satisfied.
+ */
+int
+dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
+    dsl_dir_t *ancestor, cred_t *cr)
+{
+	objset_t *os = dd->dd_pool->dp_meta_objset;
+	uint64_t limit, count;
+	char *count_prop;
+	enforce_res_t enforce;
+	int err = 0;
+
+	ASSERT(dsl_pool_config_held(dd->dd_pool));
+	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+	/*
+	 * If we're allowed to change the limit, don't enforce the limit;
+	 * e.g. this can happen if a snapshot is taken by an administrative
+	 * user in the global zone (i.e. a recursive snapshot by root).
+	 * However, we must handle the case of delegated permissions where we
+	 * are allowed to change the limit on the current dataset, but there
+	 * is another limit in the tree above.
+	 */
+	enforce = dsl_enforce_ds_ss_limits(dd, prop, cr);
+	if (enforce == ENFORCE_NEVER)
+		return (0);
+
+	/*
+	 * e.g. if renaming a dataset with no snapshots, count adjustment
+	 * is 0.
+	 */
+	if (delta == 0)
+		return (0);
+
+	if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
+		/*
+		 * We don't enforce the limit for temporary snapshots. This is
+		 * indicated by a NULL cred_t argument.
+		 */
+		if (cr == NULL)
+			return (0);
+
+		count_prop = DD_FIELD_SNAPSHOT_COUNT;
+	} else {
+		count_prop = DD_FIELD_FILESYSTEM_COUNT;
+	}
+
+	/*
+	 * If an ancestor has been provided, stop checking the limit once we
+	 * hit that dir. We need this during rename so that we don't overcount
+	 * the check once we recurse up to the common ancestor.
+	 */
+	if (ancestor == dd)
+		return (0);
+
+	/*
+	 * If we hit an uninitialized node while recursing up the tree, we can
+	 * stop since we know there is no limit here (or above). The counts are
+	 * not valid on this node and we know we won't touch this node's counts.
+	 */
+	if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object,
+	    count_prop, sizeof (count), 1, &count) == ENOENT)
+		return (0);
+
+	err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
+	    B_FALSE);
+	if (err != 0)
+		return (err);
+
+	/* Is there a limit which we've hit? */
+	if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
+		return (SET_ERROR(EDQUOT));
+
+	if (dd->dd_parent != NULL)
+		err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
+		    ancestor, cr);
+
+	return (err);
+}
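
For reference, a creation path is expected to call this once with the
number of new filesystems or snapshots as the delta, a NULL ancestor
(so the walk checks all the way to the root), and the open-context
credentials. Approximately (caller shape inferred from this function's
contract):

	/* Hedged caller sketch, e.g. from a snapshot check function. */
	error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
	    ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
	if (error != 0)
		return (error);	/* typically EDQUOT at the limit */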
+
+/*
+ * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
+ * parents. When a new filesystem/snapshot is created, increment the count on
+ * all parents, and when a filesystem/snapshot is destroyed, decrement the
+ * count.
+ */
+void
+dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
+    dmu_tx_t *tx)
+{
+	int err;
+	objset_t *os = dd->dd_pool->dp_meta_objset;
+	uint64_t count;
+
+	ASSERT(dsl_pool_config_held(dd->dd_pool));
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
+	    strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
+
+	/*
+	 * When we receive an incremental stream into a filesystem that already
+	 * exists, a temporary clone is created.  We don't count this temporary
+	 * clone, whose name begins with a '%'. We also ignore hidden ($FREE,
+	 * $MOS & $ORIGIN) objsets.
+	 */
+	if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') &&
+	    strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
+		return;
+
+	/*
+	 * e.g. if renaming a dataset with no snapshots, count adjustment is 0
+	 */
+	if (delta == 0)
+		return;
+
+	/*
+	 * If we hit an uninitialized node while recursing up the tree, we can
+	 * stop since we know the counts are not valid on this node and we
+	 * know we shouldn't touch this node's counts. An uninitialized count
+	 * on the node indicates that either the feature has not yet been
+	 * activated or there are no limits on this part of the tree.
+	 */
+	if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
+	    prop, sizeof (count), 1, &count)) == ENOENT)
+		return;
+	VERIFY0(err);
+
+	count += delta;
+	/* Use a signed verify to make sure we're not negative. */
+	VERIFY3S(count, >=, 0);
+
+	VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
+	    tx));
+
+	/* Roll up this additional count into our ancestors */
+	if (dd->dd_parent != NULL)
+		dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
+}
+
 uint64_t
 dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
     dmu_tx_t *tx)
@@ -395,7 +896,7 @@
 	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
 	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
 	if (pds) {
-		VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
+		VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
 		    name, sizeof (uint64_t), 1, &ddobj, tx));
 	} else {
 		/* it's the root dir */
@@ -407,8 +908,12 @@
 	ddphys = dbuf->db_data;
 
 	ddphys->dd_creation_time = gethrestime_sec();
-	if (pds)
+	if (pds) {
 		ddphys->dd_parent_obj = pds->dd_object;
+
+		/* update the filesystem counts */
+		dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
+	}
 	ddphys->dd_props_zapobj = zap_create(mos,
 	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
 	ddphys->dd_child_dir_zapobj = zap_create(mos,
@@ -423,9 +928,9 @@
 boolean_t
 dsl_dir_is_clone(dsl_dir_t *dd)
 {
-	return (dd->dd_phys->dd_origin_obj &&
+	return (dsl_dir_phys(dd)->dd_origin_obj &&
 	    (dd->dd_pool->dp_origin_snap == NULL ||
-	    dd->dd_phys->dd_origin_obj !=
+	    dsl_dir_phys(dd)->dd_origin_obj !=
 	    dd->dd_pool->dp_origin_snap->ds_object));
 }
 
@@ -434,35 +939,52 @@
 {
 	mutex_enter(&dd->dd_lock);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
-	    dd->dd_phys->dd_used_bytes);
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
+	    dsl_dir_phys(dd)->dd_used_bytes);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
+	    dsl_dir_phys(dd)->dd_quota);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
-	    dd->dd_phys->dd_reserved);
+	    dsl_dir_phys(dd)->dd_reserved);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
-	    dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
-	    (dd->dd_phys->dd_uncompressed_bytes * 100 /
-	    dd->dd_phys->dd_compressed_bytes));
+	    dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
+	    (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
+	    dsl_dir_phys(dd)->dd_compressed_bytes));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
-	    dd->dd_phys->dd_uncompressed_bytes);
-	if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+	    dsl_dir_phys(dd)->dd_uncompressed_bytes);
+	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
-		    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);
+		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
-		    dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]);
+		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
-		    dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]);
+		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
-		    dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] +
-		    dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]);
+		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
+		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
 	}
 	mutex_exit(&dd->dd_lock);
 
+	if (dsl_dir_is_zapified(dd)) {
+		uint64_t count;
+		objset_t *os = dd->dd_pool->dp_meta_objset;
+
+		if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+		    sizeof (count), 1, &count) == 0) {
+			dsl_prop_nvlist_add_uint64(nv,
+			    ZFS_PROP_FILESYSTEM_COUNT, count);
+		}
+		if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+		    sizeof (count), 1, &count) == 0) {
+			dsl_prop_nvlist_add_uint64(nv,
+			    ZFS_PROP_SNAPSHOT_COUNT, count);
+		}
+	}
+
 	if (dsl_dir_is_clone(dd)) {
 		dsl_dataset_t *ds;
-		char buf[MAXNAMELEN];
+		char buf[ZFS_MAX_DATASET_NAME_LEN];
 
 		VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
-		    dd->dd_phys->dd_origin_obj, FTAG, &ds));
+		    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
 		dsl_dataset_name(ds, buf);
 		dsl_dataset_rele(ds, FTAG);
 		dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
@@ -474,7 +996,7 @@
 {
 	dsl_pool_t *dp = dd->dd_pool;
 
-	ASSERT(dd->dd_phys);
+	ASSERT(dsl_dir_phys(dd));
 
 	if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
 		/* up the hold count until we can be written out */
@@ -485,8 +1007,9 @@
 static int64_t
 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
 {
-	uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
-	uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
+	uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
+	uint64_t new_accounted =
+	    MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
 	return (new_accounted - old_accounted);
 }
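
parent_delta() is easy to misread: the parent only accounts for usage
above this dir's reservation floor, so growth that stays inside the
reservation is invisible upstream. A standalone model with worked
values:

	#include <stdio.h>
	#include <stdint.h>

	#define MAX(a, b) ((a) > (b) ? (a) : (b))

	/* Standalone model of parent_delta() above. */
	static int64_t
	parent_delta_model(uint64_t reserved, uint64_t used, int64_t delta)
	{
		uint64_t old_accounted = MAX(used, reserved);
		uint64_t new_accounted = MAX(used + delta, reserved);

		return (new_accounted - old_accounted);
	}

	int
	main(void)
	{
		/* 40 reserved, 10 used: +20 stays inside the
		 * reservation, so the parent sees no change. */
		printf("%lld\n",
		    (long long)parent_delta_model(40, 10, 20)); /* 0 */
		/* +40 ends 10 past the reservation; only the surplus
		 * propagates. */
		printf("%lld\n",
		    (long long)parent_delta_model(40, 10, 40)); /* 10 */
		return (0);
	}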
 
@@ -545,9 +1068,9 @@
 	}
 
 	mutex_enter(&dd->dd_lock);
-	if (dd->dd_phys->dd_quota != 0)
-		quota = dd->dd_phys->dd_quota;
-	used = dd->dd_phys->dd_used_bytes;
+	if (dsl_dir_phys(dd)->dd_quota != 0)
+		quota = dsl_dir_phys(dd)->dd_quota;
+	used = dsl_dir_phys(dd)->dd_used_bytes;
 	if (!ondiskonly)
 		used += dsl_dir_space_towrite(dd);
 
@@ -556,12 +1079,12 @@
 		quota = MIN(quota, poolsize);
 	}
 
-	if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
+	if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
 		/*
 		 * We have some space reserved, in addition to what our
 		 * parent gave us.
 		 */
-		parentspace += dd->dd_phys->dd_reserved - used;
+		parentspace += dsl_dir_phys(dd)->dd_reserved - used;
 	}
 
 	if (dd == ancestor) {
@@ -590,7 +1113,6 @@
 
 struct tempreserve {
 	list_node_t tr_node;
-	dsl_pool_t *tr_dp;
 	dsl_dir_t *tr_ds;
 	uint64_t tr_size;
 };
@@ -621,7 +1143,7 @@
 	est_inflight = dsl_dir_space_towrite(dd);
 	for (i = 0; i < TXG_SIZE; i++)
 		est_inflight += dd->dd_tempreserved[i];
-	used_on_disk = dd->dd_phys->dd_used_bytes;
+	used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
 
 	/*
 	 * On the first iteration, fetch the dataset's used-on-disk and
@@ -644,10 +1166,10 @@
 	 * If this transaction will result in a net free of space,
 	 * we want to let it through.
 	 */
-	if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
+	if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
 		quota = UINT64_MAX;
 	else
-		quota = dd->dd_phys->dd_quota;
+		quota = dsl_dir_phys(dd)->dd_quota;
 
 	/*
 	 * Adjust the quota against the actual pool size at the root
@@ -701,7 +1223,7 @@
 
 	/* see if it's OK with our parent */
 	if (dd->dd_parent && parent_rsrv) {
-		boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
+		boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
 
 		return (dsl_dir_tempreserve_impl(dd->dd_parent,
 		    parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
@@ -741,24 +1263,24 @@
 		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 		tr->tr_size = lsize;
 		list_insert_tail(tr_list, tr);
-
-		err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
 	} else {
 		if (err == EAGAIN) {
-			txg_delay(dd->dd_pool, tx->tx_txg, 1);
+			/*
+			 * If arc_memory_throttle() detected that pageout
+			 * is running and we are low on memory, we delay new
+			 * non-pageout transactions to give pageout an
+			 * advantage.
+			 *
+			 * It is unfortunate to be delaying while the caller's
+			 * locks are held.
+			 */
+			txg_delay(dd->dd_pool, tx->tx_txg,
+			    MSEC2NSEC(10), MSEC2NSEC(10));
 			err = SET_ERROR(ERESTART);
 		}
-		dsl_pool_memory_pressure(dd->dd_pool);
 	}
 
 	if (err == 0) {
-		struct tempreserve *tr;
-
-		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
-		tr->tr_dp = dd->dd_pool;
-		tr->tr_size = asize;
-		list_insert_tail(tr_list, tr);
-
 		err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
 		    FALSE, asize > usize, tr_list, tx, TRUE);
 	}
@@ -787,10 +1309,8 @@
 	if (tr_cookie == NULL)
 		return;
 
-	while (tr = list_head(tr_list)) {
-		if (tr->tr_dp) {
-			dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx);
-		} else if (tr->tr_ds) {
+	while ((tr = list_head(tr_list)) != NULL) {
+		if (tr->tr_ds) {
 			mutex_enter(&tr->tr_ds->dd_lock);
 			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
 			    tr->tr_size);
@@ -806,8 +1326,14 @@
 	kmem_free(tr_list, sizeof (list_t));
 }
 
-static void
-dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+/*
+ * This should be called from open context when we think we're going to write
+ * or free space, for example when dirtying data. Be conservative; it's okay
+ * to write less space or free more, but we don't want to write more or free
+ * less than the amount specified.
+ */
+void
+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 {
 	int64_t parent_space;
 	uint64_t est_used;
@@ -816,7 +1342,7 @@
 	if (space > 0)
 		dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
 
-	est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
+	est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes;
 	parent_space = parent_delta(dd, est_used, space);
 	mutex_exit(&dd->dd_lock);
 
@@ -825,21 +1351,9 @@
 
 	/* XXX this is potentially expensive and unnecessary... */
 	if (parent_space && dd->dd_parent)
-		dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx);
+		dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
 }
 
-/*
- * Call in open context when we think we're going to write/free space,
- * eg. when dirtying data.  Be conservative (ie. OK to write less than
- * this or free more than this, but don't write more or free less).
- */
-void
-dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
-{
-	dsl_pool_willuse_space(dd->dd_pool, space, tx);
-	dsl_dir_willuse_space_impl(dd, space, tx);
-}
-
 /* call from syncing context when we actually write/free space for this dd */
 void
 dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
@@ -846,34 +1360,44 @@
     int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
 {
 	int64_t accounted_delta;
+
+	/*
+	 * dsl_dataset_set_refreservation_sync_impl() calls this with
+	 * dd_lock held, so that it can atomically update
+	 * ds->ds_reserved and the dsl_dir accounting, so that
+	 * dsl_dataset_check_quota() can see dataset and dir accounting
+	 * consistently.
+	 */
 	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(type < DD_USED_NUM);
 
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
 	if (needlock)
 		mutex_enter(&dd->dd_lock);
-	accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
-	ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
+	accounted_delta =
+	    parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
+	ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
 	ASSERT(compressed >= 0 ||
-	    dd->dd_phys->dd_compressed_bytes >= -compressed);
+	    dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
 	ASSERT(uncompressed >= 0 ||
-	    dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
-	dmu_buf_will_dirty(dd->dd_dbuf, tx);
-	dd->dd_phys->dd_used_bytes += used;
-	dd->dd_phys->dd_uncompressed_bytes += uncompressed;
-	dd->dd_phys->dd_compressed_bytes += compressed;
+	    dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
+	dsl_dir_phys(dd)->dd_used_bytes += used;
+	dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
+	dsl_dir_phys(dd)->dd_compressed_bytes += compressed;
 
-	if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		ASSERT(used > 0 ||
-		    dd->dd_phys->dd_used_breakdown[type] >= -used);
-		dd->dd_phys->dd_used_breakdown[type] += used;
+		    dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
+		dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
 #ifdef DEBUG
 		dd_used_t t;
 		uint64_t u = 0;
 		for (t = 0; t < DD_USED_NUM; t++)
-			u += dd->dd_phys->dd_used_breakdown[t];
-		ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
+			u += dsl_dir_phys(dd)->dd_used_breakdown[t];
+		ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
 #endif
 	}
 	if (needlock)
@@ -884,7 +1408,7 @@
 		    accounted_delta, compressed, uncompressed, tx);
 		dsl_dir_transfer_space(dd->dd_parent,
 		    used - accounted_delta,
-		    DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
+		    DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL);
 	}
 }
 
@@ -892,26 +1416,24 @@
 dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
     dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
 {
-	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
-
-	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(tx == NULL || dmu_tx_is_syncing(tx));
 	ASSERT(oldtype < DD_USED_NUM);
 	ASSERT(newtype < DD_USED_NUM);
 
-	if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
+	if (delta == 0 ||
+	    !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
 		return;
 
-	if (needlock)
-		mutex_enter(&dd->dd_lock);
+	if (tx != NULL)
+		dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	mutex_enter(&dd->dd_lock);
 	ASSERT(delta > 0 ?
-	    dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
-	    dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
-	ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
-	dmu_buf_will_dirty(dd->dd_dbuf, tx);
-	dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
-	dd->dd_phys->dd_used_breakdown[newtype] += delta;
-	if (needlock)
-		mutex_exit(&dd->dd_lock);
+	    dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
+	    dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
+	ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
+	dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
+	dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
+	mutex_exit(&dd->dd_lock);
 }
 
 typedef struct dsl_dir_set_qr_arg {
@@ -954,8 +1476,8 @@
 	 */
 	towrite = dsl_dir_space_towrite(ds->ds_dir);
 	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
-	    (newval < ds->ds_dir->dd_phys->dd_reserved ||
-	    newval < ds->ds_dir->dd_phys->dd_used_bytes + towrite)) {
+	    (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
+	    newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
 		error = SET_ERROR(ENOSPC);
 	}
 	mutex_exit(&ds->ds_dir->dd_lock);
@@ -988,7 +1510,7 @@
 
 	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 	mutex_enter(&ds->ds_dir->dd_lock);
-	ds->ds_dir->dd_phys->dd_quota = newval;
+	dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
 	mutex_exit(&ds->ds_dir->dd_lock);
 	dsl_dataset_rele(ds, FTAG);
 }
@@ -1003,7 +1525,7 @@
 	ddsqra.ddsqra_value = quota;
 
 	return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
-	    dsl_dir_set_quota_sync, &ddsqra, 0));
+	    dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 int
@@ -1039,7 +1561,7 @@
 	}
 
 	mutex_enter(&dd->dd_lock);
-	used = dd->dd_phys->dd_used_bytes;
+	used = dsl_dir_phys(dd)->dd_used_bytes;
 	mutex_exit(&dd->dd_lock);
 
 	if (dd->dd_parent) {
@@ -1049,13 +1571,13 @@
 		avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
 	}
 
-	if (MAX(used, newval) > MAX(used, dd->dd_phys->dd_reserved)) {
+	if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
 		uint64_t delta = MAX(used, newval) -
-		    MAX(used, dd->dd_phys->dd_reserved);
+		    MAX(used, dsl_dir_phys(dd)->dd_reserved);
 
 		if (delta > avail ||
-		    (dd->dd_phys->dd_quota > 0 &&
-		    newval > dd->dd_phys->dd_quota))
+		    (dsl_dir_phys(dd)->dd_quota > 0 &&
+		    newval > dsl_dir_phys(dd)->dd_quota))
 			error = SET_ERROR(ENOSPC);
 	}
 
@@ -1072,9 +1594,9 @@
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
 	mutex_enter(&dd->dd_lock);
-	used = dd->dd_phys->dd_used_bytes;
-	delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved);
-	dd->dd_phys->dd_reserved = value;
+	used = dsl_dir_phys(dd)->dd_used_bytes;
+	delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
+	dsl_dir_phys(dd)->dd_reserved = value;
 
 	if (dd->dd_parent != NULL) {
 		/* Roll up this additional usage into our ancestors */
@@ -1124,7 +1646,7 @@
 	ddsqra.ddsqra_value = reservation;
 
 	return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
-	    dsl_dir_set_reservation_sync, &ddsqra, 0));
+	    dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 static dsl_dir_t *
@@ -1151,7 +1673,7 @@
 		return (delta);
 
 	mutex_enter(&dd->dd_lock);
-	delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
+	delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
 	mutex_exit(&dd->dd_lock);
 	return (would_change(dd->dd_parent, delta, ancestor));
 }
@@ -1159,6 +1681,7 @@
 typedef struct dsl_dir_rename_arg {
 	const char *ddra_oldname;
 	const char *ddra_newname;
+	cred_t *ddra_cred;
 } dsl_dir_rename_arg_t;
 
 /* ARGSUSED */
@@ -1166,11 +1689,11 @@
 dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	int *deltap = arg;
-	char namebuf[MAXNAMELEN];
+	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
 
 	dsl_dataset_name(ds, namebuf);
 
-	if (strlen(namebuf) + *deltap >= MAXNAMELEN)
+	if (strlen(namebuf) + *deltap >= ZFS_MAX_DATASET_NAME_LEN)
 		return (SET_ERROR(ENAMETOOLONG));
 	return (0);
 }
@@ -1202,7 +1725,7 @@
 	if (dd->dd_pool != newparent->dd_pool) {
 		dsl_dir_rele(newparent, FTAG);
 		dsl_dir_rele(dd, FTAG);
-		return (SET_ERROR(ENXIO));
+		return (SET_ERROR(EXDEV));
 	}
 
 	/* new name should not already exist */
@@ -1223,11 +1746,58 @@
 		}
 	}
 
+	if (dmu_tx_is_syncing(tx)) {
+		if (spa_feature_is_active(dp->dp_spa,
+		    SPA_FEATURE_FS_SS_LIMIT)) {
+			/*
+			 * Although this is the check function and we don't
+			 * normally make on-disk changes in check functions,
+			 * we need to do that here.
+			 *
+			 * Ensure this portion of the tree's counts have been
+			 * initialized in case the new parent has limits set.
+			 */
+			dsl_dir_init_fs_ss_count(dd, tx);
+		}
+	}
+
 	if (newparent != dd->dd_parent) {
 		/* is there enough space? */
 		uint64_t myspace =
-		    MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
+		    MAX(dsl_dir_phys(dd)->dd_used_bytes,
+		    dsl_dir_phys(dd)->dd_reserved);
+		objset_t *os = dd->dd_pool->dp_meta_objset;
+		uint64_t fs_cnt = 0;
+		uint64_t ss_cnt = 0;
 
+		if (dsl_dir_is_zapified(dd)) {
+			int err;
+
+			err = zap_lookup(os, dd->dd_object,
+			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
+			    &fs_cnt);
+			if (err != ENOENT && err != 0) {
+				dsl_dir_rele(newparent, FTAG);
+				dsl_dir_rele(dd, FTAG);
+				return (err);
+			}
+
+			/*
+			 * have to add 1 for the filesystem itself that we're
+			 * moving
+			 */
+			fs_cnt++;
+
+			err = zap_lookup(os, dd->dd_object,
+			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
+			    &ss_cnt);
+			if (err != ENOENT && err != 0) {
+				dsl_dir_rele(newparent, FTAG);
+				dsl_dir_rele(dd, FTAG);
+				return (err);
+			}
+		}
+
 		/* no rename into our descendant */
 		if (closest_common_ancestor(dd, newparent) == dd) {
 			dsl_dir_rele(newparent, FTAG);
@@ -1236,7 +1806,7 @@
 		}
 
 		error = dsl_dir_transfer_possible(dd->dd_parent,
-		    newparent, myspace);
+		    newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred);
 		if (error != 0) {
 			dsl_dir_rele(newparent, FTAG);
 			dsl_dir_rele(dd, FTAG);
@@ -1268,18 +1838,50 @@
 	    "-> %s", ddra->ddra_newname);
 
 	if (newparent != dd->dd_parent) {
+		objset_t *os = dd->dd_pool->dp_meta_objset;
+		uint64_t fs_cnt = 0;
+		uint64_t ss_cnt = 0;
+
+		/*
+		 * We already made sure the dd counts were initialized in the
+		 * check function.
+		 */
+		if (spa_feature_is_active(dp->dp_spa,
+		    SPA_FEATURE_FS_SS_LIMIT)) {
+			VERIFY0(zap_lookup(os, dd->dd_object,
+			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
+			    &fs_cnt));
+			/* add 1 for the filesystem itself that we're moving */
+			fs_cnt++;
+
+			VERIFY0(zap_lookup(os, dd->dd_object,
+			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
+			    &ss_cnt));
+		}
+
+		dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
+		    DD_FIELD_FILESYSTEM_COUNT, tx);
+		dsl_fs_ss_count_adjust(newparent, fs_cnt,
+		    DD_FIELD_FILESYSTEM_COUNT, tx);
+
+		dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
+		    DD_FIELD_SNAPSHOT_COUNT, tx);
+		dsl_fs_ss_count_adjust(newparent, ss_cnt,
+		    DD_FIELD_SNAPSHOT_COUNT, tx);
+
 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
-		    -dd->dd_phys->dd_used_bytes,
-		    -dd->dd_phys->dd_compressed_bytes,
-		    -dd->dd_phys->dd_uncompressed_bytes, tx);
+		    -dsl_dir_phys(dd)->dd_used_bytes,
+		    -dsl_dir_phys(dd)->dd_compressed_bytes,
+		    -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
 		dsl_dir_diduse_space(newparent, DD_USED_CHILD,
-		    dd->dd_phys->dd_used_bytes,
-		    dd->dd_phys->dd_compressed_bytes,
-		    dd->dd_phys->dd_uncompressed_bytes, tx);
+		    dsl_dir_phys(dd)->dd_used_bytes,
+		    dsl_dir_phys(dd)->dd_compressed_bytes,
+		    dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
 
-		if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
-			uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
-			    dd->dd_phys->dd_used_bytes;
+		if (dsl_dir_phys(dd)->dd_reserved >
+		    dsl_dir_phys(dd)->dd_used_bytes) {
+			uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
+			    dsl_dir_phys(dd)->dd_used_bytes;
 
 			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
 			    -unused_rsrv, 0, 0, tx);
@@ -1291,18 +1893,19 @@
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
 	/* remove from old parent zapobj */
-	error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+	error = zap_remove(mos,
+	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
 	    dd->dd_myname, tx);
 	ASSERT0(error);
 
 	(void) strcpy(dd->dd_myname, mynewname);
 	dsl_dir_rele(dd->dd_parent, dd);
-	dd->dd_phys->dd_parent_obj = newparent->dd_object;
+	dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
 	VERIFY0(dsl_dir_hold_obj(dp,
 	    newparent->dd_object, NULL, dd, &dd->dd_parent));
 
 	/* add to new parent zapobj */
-	VERIFY0(zap_add(mos, newparent->dd_phys->dd_child_dir_zapobj,
+	VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
 	    dd->dd_myname, 8, 1, &dd->dd_object, tx));
 
 #ifdef __FreeBSD__
@@ -1325,17 +1928,21 @@
 
 	ddra.ddra_oldname = oldname;
 	ddra.ddra_newname = newname;
+	ddra.ddra_cred = CRED();
 
 	return (dsl_sync_task(oldname,
-	    dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3));
+	    dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
+	    3, ZFS_SPACE_CHECK_RESERVED));
 }
 
 int
-dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
+dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
+    uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr)
 {
 	dsl_dir_t *ancestor;
 	int64_t adelta;
 	uint64_t avail;
+	int err;
 
 	ancestor = closest_common_ancestor(sdd, tdd);
 	adelta = would_change(sdd, -space, ancestor);
@@ -1343,6 +1950,15 @@
 	if (avail < space)
 		return (SET_ERROR(ENOSPC));
 
+	err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
+	    ancestor, cr);
+	if (err != 0)
+		return (err);
+	err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
+	    ancestor, cr);
+	if (err != 0)
+		return (err);
+
 	return (0);
 }
 
@@ -1368,3 +1984,19 @@
 	dd->dd_snap_cmtime = t;
 	mutex_exit(&dd->dd_lock);
 }
+
+void
+dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
+}
+
+boolean_t
+dsl_dir_is_zapified(dsl_dir_t *dd)
+{
+	dmu_object_info_t doi;
+
+	dmu_object_info_from_db(dd->dd_dbuf, &doi);
+	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
+}
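
These two helpers complete the extensible-property plumbing used
throughout this change: upgrade the dir's object to a ZAP in place,
then store named fields alongside the fixed dsl_dir_phys_t. The idiom,
as dsl_dir_init_fs_ss_count() uses it above (kernel context, not
standalone):

	/* Zapify once (idempotent), then use ordinary ZAP ops. */
	dsl_dir_zapify(dd, tx);
	VERIFY0(zap_update(mos, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
	    sizeof (count), 1, &count, tx));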

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,8 +21,10 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/dsl_pool.h>
@@ -47,45 +49,189 @@
 #include <sys/zil_impl.h>
 #include <sys/dsl_userhold.h>
 
-int zfs_no_write_throttle = 0;
-int zfs_write_limit_shift = 3;			/* 1/8th of physical memory */
-int zfs_txg_synctime_ms = 1000;		/* target millisecs to sync a txg */
+#ifdef __FreeBSD__
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
 
-uint64_t zfs_write_limit_min = 32 << 20;	/* min write limit is 32MB */
-uint64_t zfs_write_limit_max = 0;		/* max data payload per txg */
-uint64_t zfs_write_limit_inflated = 0;
-uint64_t zfs_write_limit_override = 0;
+/*
+ * ZFS Write Throttle
+ * ------------------
+ *
+ * ZFS must limit the rate of incoming writes to the rate at which it is able
+ * to sync data modifications to the backend storage. Throttling by too much
+ * creates an artificial limit; throttling by too little can only be sustained
+ * for short periods and would lead to highly lumpy performance. On a per-pool
+ * basis, ZFS tracks the amount of modified (dirty) data. As operations change
+ * data, the amount of dirty data increases; as ZFS syncs out data, the amount
+ * of dirty data decreases. When the amount of dirty data exceeds a
+ * predetermined threshold, further modifications are blocked until the amount
+ * of dirty data decreases (as data is synced out).
+ *
+ * The limit on dirty data is tunable, and should be adjusted according to
+ * both the IO capacity and available memory of the system. The larger the
+ * window, the more ZFS is able to aggregate and amortize metadata (and data)
+ * changes. However, memory is a limited resource, and allowing for more dirty
+ * data comes at the cost of keeping other useful data in memory (for example
+ * ZFS data cached by the ARC).
+ *
+ * Implementation
+ *
+ * As buffers are modified, dsl_pool_willuse_space() increments both the per-
+ * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
+ * dirty space used; dsl_pool_dirty_space() decrements those values as data
+ * is synced out from dsl_pool_sync(). While only the poolwide value is
+ * relevant, the per-txg value is useful for debugging. The tunable
+ * zfs_dirty_data_max determines the dirty space limit. Once that value is
+ * exceeded, new writes are halted until space frees up.
+ *
+ * The zfs_dirty_data_sync tunable dictates the threshold at which we
+ * ensure that there is a txg syncing (see the comment in txg.c for a full
+ * description of transaction group stages).
+ *
+ * The IO scheduler uses both the dirty space limit and current amount of
+ * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
+ * issues. See the comment in vdev_queue.c for details of the IO scheduler.
+ *
+ * The delay is also calculated based on the amount of dirty data.  See the
+ * comment above dmu_tx_delay() for details.
+ */
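
The delay mentioned in the last paragraph is, per the comment above
dmu_tx_delay(), a hyperbolic function of the dirty-data level. A
standalone sketch of that curve using this file's tunables (the exact
formula lives in dmu_tx.c; treat this as an approximation):

	#include <stdio.h>
	#include <stdint.h>

	static uint64_t zfs_dirty_data_max = 4ULL << 30;	/* 4 GB */
	static int zfs_delay_min_dirty_percent = 60;
	static uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;

	/* Approximates dmu_tx_delay(): zero below the threshold, then
	 * a ramp that grows without bound as dirty data nears max. */
	static uint64_t
	tx_delay_ns(uint64_t dirty)
	{
		uint64_t min_bytes = zfs_dirty_data_max *
		    zfs_delay_min_dirty_percent / 100;

		if (dirty <= min_bytes)
			return (0);
		return (zfs_delay_scale * (dirty - min_bytes) /
		    (zfs_dirty_data_max - dirty));
	}

	int
	main(void)
	{
		for (int pct = 50; pct <= 95; pct += 15) {
			uint64_t dirty = zfs_dirty_data_max / 100 * pct;

			printf("%3d%% dirty -> %llu ns\n", pct,
			    (unsigned long long)tx_delay_ns(dirty));
		}
		return (0);
	}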
 
-kmutex_t zfs_write_limit_lock;
+/*
+ * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
+ * capped at zfs_dirty_data_max_max.  It can also be overridden in /etc/system.
+ */
+uint64_t zfs_dirty_data_max;
+uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
+int zfs_dirty_data_max_percent = 10;
 
-static pgcnt_t old_physmem = 0;
+/*
+ * If there is at least this much dirty data, push out a txg.
+ */
+uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;
 
+/*
+ * Once there is this amount of dirty data, dmu_tx_delay() will kick in
+ * and delay each transaction.
+ * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
+ */
+int zfs_delay_min_dirty_percent = 60;
+
+/*
+ * This controls how quickly the delay approaches infinity.
+ * Larger values cause it to delay more for a given amount of dirty data.
+ * Therefore larger values will cause there to be less dirty data for a
+ * given throughput.
+ *
+ * For the smoothest delay, this value should be about 1 billion divided
+ * by the maximum number of operations per second.  This will smoothly
+ * handle between 10x and 1/10th this number.
+ *
+ * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
+ * multiply in dmu_tx_delay().
+ */
+uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
+
+
+#ifdef __FreeBSD__
+
+extern int zfs_vdev_async_write_active_max_dirty_percent;
+
 SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.no_write_throttle", &zfs_no_write_throttle);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, no_write_throttle, CTLFLAG_RDTUN,
-    &zfs_no_write_throttle, 0, "");
-TUNABLE_INT("vfs.zfs.write_limit_shift", &zfs_write_limit_shift);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, write_limit_shift, CTLFLAG_RDTUN,
-    &zfs_write_limit_shift, 0, "2^N of physical memory");
-SYSCTL_DECL(_vfs_zfs_txg);
-TUNABLE_INT("vfs.zfs.txg.synctime_ms", &zfs_txg_synctime_ms);
-SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, synctime_ms, CTLFLAG_RDTUN,
-    &zfs_txg_synctime_ms, 0, "Target milliseconds to sync a txg");
 
-TUNABLE_QUAD("vfs.zfs.write_limit_min", &zfs_write_limit_min);
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_min, CTLFLAG_RDTUN,
-    &zfs_write_limit_min, 0, "Minimum write limit");
-TUNABLE_QUAD("vfs.zfs.write_limit_max", &zfs_write_limit_max);
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_max, CTLFLAG_RDTUN,
-    &zfs_write_limit_max, 0, "Maximum data payload per txg");
-TUNABLE_QUAD("vfs.zfs.write_limit_inflated", &zfs_write_limit_inflated);
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_inflated, CTLFLAG_RDTUN,
-    &zfs_write_limit_inflated, 0, "Maximum size of the dynamic write limit");
-TUNABLE_QUAD("vfs.zfs.write_limit_override", &zfs_write_limit_override);
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_override, CTLFLAG_RDTUN,
-    &zfs_write_limit_override, 0,
-    "Force a txg if dirty buffers exceed this value (bytes)");
+TUNABLE_QUAD("vfs.zfs.dirty_data_max", &zfs_dirty_data_max);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN,
+    &zfs_dirty_data_max, 0,
+    "The maximum amount of dirty data in bytes after which new writes are "
+    "halted until space becomes available");
 
+TUNABLE_QUAD("vfs.zfs.dirty_data_max_max", &zfs_dirty_data_max_max);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN,
+    &zfs_dirty_data_max_max, 0,
+    "The absolute cap on dirty_data_max when auto calculating");
+
+TUNABLE_INT("vfs.zfs.dirty_data_max_percent", &zfs_dirty_data_max_percent);
+static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent,
+    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+    sysctl_zfs_dirty_data_max_percent, "I",
+    "The percent of physical memory used to auto calculate dirty_data_max");
+
+TUNABLE_QUAD("vfs.zfs.dirty_data_sync", &zfs_dirty_data_sync);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN,
+    &zfs_dirty_data_sync, 0,
+    "Force a txg if the number of dirty buffer bytes exceed this value");
+
+static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS);
+/* No zfs_delay_min_dirty_percent tunable due to limit requirements */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent,
+    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
+    sysctl_zfs_delay_min_dirty_percent, "I",
+    "The limit of outstanding dirty data before transactions are delayed");
+
+static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS);
+/* No zfs_delay_scale tunable due to limit requirements */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale,
+    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+    sysctl_zfs_delay_scale, "QU",
+    "Controls how quickly the delay approaches infinity");
+
+static int
+sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS)
+{
+	int val, err;
+
+	val = zfs_dirty_data_max_percent;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < 0 || val > 100)
+		return (EINVAL);
+
+	zfs_dirty_data_max_percent = val;
+
+	return (0);
+}
+
+static int
+sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS)
+{
+	int val, err;
+
+	val = zfs_delay_min_dirty_percent;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < zfs_vdev_async_write_active_max_dirty_percent)
+		return (EINVAL);
+
+	zfs_delay_min_dirty_percent = val;
+
+	return (0);
+}
+
+static int
+sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int err;
+
+	val = zfs_delay_scale;
+	err = sysctl_handle_64(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val > UINT64_MAX / zfs_dirty_data_max)
+		return (EINVAL);
+
+	zfs_delay_scale = val;
+
+	return (0);
+}
+#endif
+
 int
 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
@@ -93,7 +239,7 @@
 	int err;
 
 	err = zap_lookup(dp->dp_meta_objset,
-	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
+	    dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
 	    name, sizeof (obj), 1, &obj);
 	if (err)
 		return (err);
@@ -111,7 +257,6 @@
 	dp->dp_spa = spa;
 	dp->dp_meta_rootbp = *bp;
 	rrw_init(&dp->dp_config_rwlock, B_TRUE);
-	dp->dp_write_limit = zfs_write_limit_min;
 	txg_init(dp, txg);
 
 	txg_list_create(&dp->dp_dirty_datasets,
@@ -124,6 +269,7 @@
 	    offsetof(dsl_sync_task_t, dst_node));
 
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
 	dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
 	    1, 4, 0);
@@ -175,11 +321,11 @@
 		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 		if (err)
 			goto out;
-		err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
-		    FTAG, &ds);
+		err = dsl_dataset_hold_obj(dp,
+		    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
 		if (err == 0) {
 			err = dsl_dataset_hold_obj(dp,
-			    ds->ds_phys->ds_prev_snap_obj, dp,
+			    dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
 			    &dp->dp_origin_snap);
 			dsl_dataset_rele(ds, FTAG);
 		}
@@ -202,8 +348,14 @@
 		    dp->dp_meta_objset, obj));
 	}
 
-	if (spa_feature_is_active(dp->dp_spa,
-	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+	/*
+	 * Note: errors ignored, because the leak dir will not exist if we
+	 * have not encountered a leak yet.
+	 */
+	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
+	    &dp->dp_leak_dir);
+
+	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 		    &dp->dp_bptree_obj);
@@ -211,8 +363,7 @@
 			goto out;
 	}
 
-	if (spa_feature_is_active(dp->dp_spa,
-	    &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) {
+	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
 		    &dp->dp_empty_bpobj);
@@ -238,9 +389,9 @@
 void
 dsl_pool_close(dsl_pool_t *dp)
 {
-	/* drop our references from dsl_pool_open() */
-
 	/*
+	 * Drop our references from dsl_pool_open().
+	 *
 	 * Since we held the origin_snap from "syncing" context (which
 	 * includes pool-opening context), it actually only got a "ref"
 	 * and not a hold, so just drop that here.
@@ -251,6 +402,8 @@
 		dsl_dir_rele(dp->dp_mos_dir, dp);
 	if (dp->dp_free_dir)
 		dsl_dir_rele(dp->dp_free_dir, dp);
+	if (dp->dp_leak_dir)
+		dsl_dir_rele(dp->dp_leak_dir, dp);
 	if (dp->dp_root_dir)
 		dsl_dir_rele(dp->dp_root_dir, dp);
 
@@ -265,9 +418,18 @@
 	txg_list_destroy(&dp->dp_sync_tasks);
 	txg_list_destroy(&dp->dp_dirty_dirs);
 
-	arc_flush(dp->dp_spa);
+	/*
+	 * We can't set retry to TRUE since we're explicitly specifying
+	 * a spa to flush. This is good enough; any missed buffers for
+	 * this spa won't cause trouble, and they'll eventually fall
+	 * out of the ARC just like any other unused buffer.
+	 */
+	arc_flush(dp->dp_spa, FALSE);
+
 	txg_fini(dp);
 	dsl_scan_fini(dp);
+	dmu_buf_user_evict_wait();
+
 	rrw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
 	taskq_destroy(dp->dp_vnrele_taskq);
@@ -318,7 +480,7 @@
 		    FREE_DIR_NAME, &dp->dp_free_dir));
 
 		/* create and open the free_bplist */
-		obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
+		obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
 		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
 		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
@@ -333,8 +495,10 @@
 
 	/* create the root objset */
 	VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	os = dmu_objset_create_impl(dp->dp_spa, ds,
 	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 #ifdef _KERNEL
 	zfs_create_fs(os, kcred, zplprops, tx);
 #endif
@@ -362,14 +526,34 @@
 	mutex_exit(&dp->dp_lock);
 }
 
-static int
-deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+static void
+dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
 {
-	dsl_deadlist_t *dl = arg;
-	dsl_deadlist_insert(dl, bp, tx);
-	return (0);
+	zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+	dmu_objset_sync(dp->dp_meta_objset, zio, tx);
+	VERIFY0(zio_wait(zio));
+	dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
+	spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
 }
 
+static void
+dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
+{
+	ASSERT(MUTEX_HELD(&dp->dp_lock));
+
+	if (delta < 0)
+		ASSERT3U(-delta, <=, dp->dp_dirty_total);
+
+	dp->dp_dirty_total += delta;
+
+	/*
+	 * Note: we signal even when increasing dp_dirty_total.
+	 * This ensures forward progress -- each thread wakes the next waiter.
+	 */
+	if (dp->dp_dirty_total < zfs_dirty_data_max)
+		cv_signal(&dp->dp_spaceavail_cv);
+}
+
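The consumer of dp_spaceavail_cv is the throttle side of this accounting;
roughly, per the matching dmu_tx.c change in this sync (shown here only for
context, not part of this file):

	mutex_enter(&dp->dp_lock);
	while (dp->dp_dirty_total >= zfs_dirty_data_max)
		cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
	uint64_t dirty = dp->dp_dirty_total;
	mutex_exit(&dp->dp_lock);

Because dsl_pool_dirty_delta() signals on every delta that leaves the total
below the limit, each waiter that wakes and adjusts dirty data re-signals,
so a queue of blocked writers drains one at a time instead of stalling.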
 void
 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 {
@@ -378,29 +562,18 @@
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	objset_t *mos = dp->dp_meta_objset;
-	hrtime_t start, write_time;
-	uint64_t data_written;
-	int err;
 	list_t synced_datasets;
 
 	list_create(&synced_datasets, sizeof (dsl_dataset_t),
 	    offsetof(dsl_dataset_t, ds_synced_link));
 
+	tx = dmu_tx_create_assigned(dp, txg);
+
 	/*
-	 * We need to copy dp_space_towrite() before doing
-	 * dsl_sync_task_sync(), because
-	 * dsl_dataset_snapshot_reserve_space() will increase
-	 * dp_space_towrite but not actually write anything.
+	 * Write out all dirty blocks of dirty datasets.
 	 */
-	data_written = dp->dp_space_towrite[txg & TXG_MASK];
-
-	tx = dmu_tx_create_assigned(dp, txg);
-
-	dp->dp_read_overhead = 0;
-	start = gethrtime();
-
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		/*
 		 * We must not sync any non-MOS datasets twice, because
 		 * we may have taken a snapshot of them.  However, we
@@ -410,20 +583,25 @@
 		list_insert_tail(&synced_datasets, ds);
 		dsl_dataset_sync(ds, zio, tx);
 	}
-	DTRACE_PROBE(pool_sync__1setup);
-	err = zio_wait(zio);
+	VERIFY0(zio_wait(zio));
 
-	write_time = gethrtime() - start;
-	ASSERT(err == 0);
-	DTRACE_PROBE(pool_sync__2rootzio);
+	/*
+	 * We have written all of the accounted dirty data, so our
+	 * dp_space_towrite should now be zero.  However, some seldom-used
+	 * code paths do not adhere to this (e.g. dbuf_undirty(), also
+	 * rounding error in dbuf_write_physdone).
+	 * Shore up the accounting of any dirtied space now.
+	 */
+	dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
 
 	/*
 	 * After the data blocks have been written (ensured by the zio_wait()
 	 * above), update the user/group space accounting.
 	 */
-	for (ds = list_head(&synced_datasets); ds;
-	    ds = list_next(&synced_datasets, ds))
+	for (ds = list_head(&synced_datasets); ds != NULL;
+	    ds = list_next(&synced_datasets, ds)) {
 		dmu_objset_do_userquota_updates(ds->ds_objset, tx);
+	}
 
 	/*
 	 * Sync the datasets again to push out the changes due to
@@ -433,12 +611,12 @@
 	 * about which blocks are part of the snapshot).
 	 */
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		ASSERT(list_link_active(&ds->ds_synced_link));
 		dmu_buf_rele(ds->ds_dbuf, ds);
 		dsl_dataset_sync(ds, zio, tx);
 	}
-	err = zio_wait(zio);
+	VERIFY0(zio_wait(zio));
 
 	/*
 	 * Now that the datasets have been completely synced, we can
@@ -447,18 +625,12 @@
 	 *  - move dead blocks from the pending deadlist to the on-disk deadlist
 	 *  - release hold from dsl_dataset_dirty()
 	 */
-	while (ds = list_remove_head(&synced_datasets)) {
-		objset_t *os = ds->ds_objset;
-		bplist_iterate(&ds->ds_pending_deadlist,
-		    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
-		ASSERT(!dmu_objset_is_dirty(os, txg));
-		dmu_buf_rele(ds->ds_dbuf, ds);
+	while ((ds = list_remove_head(&synced_datasets)) != NULL) {
+		dsl_dataset_sync_done(ds, tx);
 	}
-
-	start = gethrtime();
-	while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
+	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
 		dsl_dir_sync(dd, tx);
-	write_time += gethrtime() - start;
+	}
 
 	/*
 	 * The MOS's space is accounted for in the pool/$MOS
@@ -476,20 +648,10 @@
 		dp->dp_mos_uncompressed_delta = 0;
 	}
 
-	start = gethrtime();
 	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
-		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-		dmu_objset_sync(mos, zio, tx);
-		err = zio_wait(zio);
-		ASSERT(err == 0);
-		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
-		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+		dsl_pool_sync_mos(dp, tx);
 	}
-	write_time += gethrtime() - start;
-	DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
-	    hrtime_t, dp->dp_read_overhead);
-	write_time -= dp->dp_read_overhead;
 
 	/*
 	 * If we modify a dataset in the same txg that we want to destroy it,
@@ -500,7 +662,6 @@
 	 * The MOS data dirtied by the sync_tasks will be synced on the next
 	 * pass.
 	 */
-	DTRACE_PROBE(pool_sync__3task);
 	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
 		dsl_sync_task_t *dst;
 		/*
@@ -507,54 +668,14 @@
 		 * No more sync tasks should have been added while we
 		 * were syncing.
 		 */
-		ASSERT(spa_sync_pass(dp->dp_spa) == 1);
-		while (dst = txg_list_remove(&dp->dp_sync_tasks, txg))
+		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
+		while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
 			dsl_sync_task_sync(dst, tx);
 	}
 
 	dmu_tx_commit(tx);
 
-	dp->dp_space_towrite[txg & TXG_MASK] = 0;
-	ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
-
-	/*
-	 * If the write limit max has not been explicitly set, set it
-	 * to a fraction of available physical memory (default 1/8th).
-	 * Note that we must inflate the limit because the spa
-	 * inflates write sizes to account for data replication.
-	 * Check this each sync phase to catch changing memory size.
-	 */
-	if (physmem != old_physmem && zfs_write_limit_shift) {
-		mutex_enter(&zfs_write_limit_lock);
-		old_physmem = physmem;
-		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
-		zfs_write_limit_inflated = MAX(zfs_write_limit_min,
-		    spa_get_asize(dp->dp_spa, zfs_write_limit_max));
-		mutex_exit(&zfs_write_limit_lock);
-	}
-
-	/*
-	 * Attempt to keep the sync time consistent by adjusting the
-	 * amount of write traffic allowed into each transaction group.
-	 * Weight the throughput calculation towards the current value:
-	 * 	thru = 3/4 old_thru + 1/4 new_thru
-	 *
-	 * Note: write_time is in nanosecs, so write_time/MICROSEC
-	 * yields millisecs
-	 */
-	ASSERT(zfs_write_limit_min > 0);
-	if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
-		uint64_t throughput = data_written / (write_time / MICROSEC);
-
-		if (dp->dp_throughput)
-			dp->dp_throughput = throughput / 4 +
-			    3 * dp->dp_throughput / 4;
-		else
-			dp->dp_throughput = throughput;
-		dp->dp_write_limit = MIN(zfs_write_limit_inflated,
-		    MAX(zfs_write_limit_min,
-		    dp->dp_throughput * zfs_txg_synctime_ms));
-	}
+	DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
 }
 
 void
@@ -561,11 +682,17 @@
 dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 {
 	zilog_t *zilog;
-	dsl_dataset_t *ds;
 
-	while (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) {
-		ds = dmu_objset_ds(zilog->zl_os);
+	while (zilog = txg_list_head(&dp->dp_dirty_zilogs, txg)) {
+		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
+		/*
+		 * We don't remove the zilog from the dp_dirty_zilogs
+		 * list until after we've cleaned it. This ensures that
+		 * callers of zilog_is_dirty() receive an accurate
+		 * answer when they are racing with the spa sync thread.
+		 */
 		zil_clean(zilog, txg);
+		(void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg);
 		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
 		dmu_buf_rele(ds->ds_dbuf, zilog);
 	}
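
The ordering defended in the comment above matters because
zilog_is_dirty() answers purely by list membership; roughly, as zil.c
reads in this era of the code:

boolean_t
zilog_is_dirty(zilog_t *zilog)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;

	for (int t = 0; t < TXG_SIZE; t++) {
		if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
			return (B_TRUE);
	}
	return (B_FALSE);
}

Removing the zilog before zil_clean() had run would let that check return
B_FALSE while cleaning was still in flight.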
@@ -589,17 +716,12 @@
 	uint64_t space, resv;
 
 	/*
-	 * Reserve about 1.6% (1/64), or at least 32MB, for allocation
-	 * efficiency.
-	 * XXX The intent log is not accounted for, so it must fit
-	 * within this slop.
-	 *
 	 * If we're trying to assess whether it's OK to do a free,
 	 * cut the reservation in half to allow forward progress
 	 * (e.g. make it possible to rm(1) files from a full pool).
 	 */
 	space = spa_get_dspace(dp->dp_spa);
-	resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
+	resv = spa_get_slop_space(dp->dp_spa);
 	if (netfree)
 		resv >>= 1;
 
@@ -606,82 +728,50 @@
 	return (space - resv);
 }
 
-int
-dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
+boolean_t
+dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 {
-	uint64_t reserved = 0;
-	uint64_t write_limit = (zfs_write_limit_override ?
-	    zfs_write_limit_override : dp->dp_write_limit);
+	uint64_t delay_min_bytes =
+	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	boolean_t rv;
 
-	if (zfs_no_write_throttle) {
-		atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
-		    space);
-		return (0);
-	}
-
-	/*
-	 * Check to see if we have exceeded the maximum allowed IO for
-	 * this transaction group.  We can do this without locks since
-	 * a little slop here is ok.  Note that we do the reserved check
-	 * with only half the requested reserve: this is because the
-	 * reserve requests are worst-case, and we really don't want to
-	 * throttle based off of worst-case estimates.
-	 */
-	if (write_limit > 0) {
-		reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
-		    + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;
-
-		if (reserved && reserved > write_limit)
-			return (SET_ERROR(ERESTART));
-	}
-
-	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);
-
-	/*
-	 * If this transaction group is over 7/8ths capacity, delay
-	 * the caller 1 clock tick.  This will slow down the "fill"
-	 * rate until the sync process can catch up with us.
-	 */
-	if (reserved && reserved > (write_limit - (write_limit >> 3)))
-		txg_delay(dp, tx->tx_txg, 1);
-
-	return (0);
+	mutex_enter(&dp->dp_lock);
+	if (dp->dp_dirty_total > zfs_dirty_data_sync)
+		txg_kick(dp);
+	rv = (dp->dp_dirty_total > delay_min_bytes);
+	mutex_exit(&dp->dp_lock);
+	return (rv);
 }
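
A worked example may help; the figures below are assumptions chosen for
illustration, not defaults on any particular machine:

	/* Illustrative only: assumed tunable values. */
	uint64_t zfs_dirty_data_max = 4ULL << 30;	/* 4 GiB */
	int zfs_delay_min_dirty_percent = 60;

	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	/* delay_min_bytes is ~2.4 GiB: dsl_pool_need_dirty_delay() returns
	 * B_TRUE above that point, while txg_kick() fires earlier, as soon
	 * as dp_dirty_total passes zfs_dirty_data_sync. */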
 
 void
-dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
+dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 {
-	ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
-	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
+	if (space > 0) {
+		mutex_enter(&dp->dp_lock);
+		dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
+		dsl_pool_dirty_delta(dp, space);
+		mutex_exit(&dp->dp_lock);
+	}
 }
 
 void
-dsl_pool_memory_pressure(dsl_pool_t *dp)
+dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
 {
-	uint64_t space_inuse = 0;
-	int i;
-
-	if (dp->dp_write_limit == zfs_write_limit_min)
+	ASSERT3S(space, >=, 0);
+	if (space == 0)
 		return;
-
-	for (i = 0; i < TXG_SIZE; i++) {
-		space_inuse += dp->dp_space_towrite[i];
-		space_inuse += dp->dp_tempreserved[i];
+	mutex_enter(&dp->dp_lock);
+	if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
+		/* XXX writing something we didn't dirty? */
+		space = dp->dp_dirty_pertxg[txg & TXG_MASK];
 	}
-	dp->dp_write_limit = MAX(zfs_write_limit_min,
-	    MIN(dp->dp_write_limit, space_inuse / 4));
+	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
+	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
+	ASSERT3U(dp->dp_dirty_total, >=, space);
+	dsl_pool_dirty_delta(dp, -space);
+	mutex_exit(&dp->dp_lock);
 }
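
The invariant these two functions maintain can be stated as an assertion
(a sketch; the per-txg buckets always sum to the pool-wide total):

	uint64_t total = 0;
	for (int t = 0; t < TXG_SIZE; t++)
		total += dp->dp_dirty_pertxg[t];
	ASSERT3U(total, ==, dp->dp_dirty_total);

dsl_pool_dirty_delta() is the only place dp_dirty_total moves, and both
callers adjust the matching per-txg bucket under dp_lock.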
 
-void
-dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
-{
-	if (space > 0) {
-		mutex_enter(&dp->dp_lock);
-		dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
-		mutex_exit(&dp->dp_lock);
-	}
-}
-
 /* ARGSUSED */
 static int
 upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
@@ -694,15 +784,15 @@
 	if (err)
 		return (err);
 
-	while (ds->ds_phys->ds_prev_snap_obj != 0) {
-		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
-		    FTAG, &prev);
+	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+		err = dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 		if (err) {
 			dsl_dataset_rele(ds, FTAG);
 			return (err);
 		}
 
-		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
+		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
 			break;
 		dsl_dataset_rele(ds, FTAG);
 		ds = prev;
@@ -716,7 +806,9 @@
 		 * The $ORIGIN can't have any data, or the accounting
 		 * will be wrong.
 		 */
-		ASSERT0(prev->ds_phys->ds_bp.blk_birth);
+		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+		ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
+		rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 		/* The origin doesn't get attached to itself */
 		if (ds->ds_object == prev->ds_object) {
@@ -725,33 +817,35 @@
 		}
 
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
-		ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
+		dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
+		dsl_dataset_phys(ds)->ds_prev_snap_txg =
+		    dsl_dataset_phys(prev)->ds_creation_txg;
 
 		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
-		ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
+		dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
 
 		dmu_buf_will_dirty(prev->ds_dbuf, tx);
-		prev->ds_phys->ds_num_children++;
+		dsl_dataset_phys(prev)->ds_num_children++;
 
-		if (ds->ds_phys->ds_next_snap_obj == 0) {
+		if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
 			ASSERT(ds->ds_prev == NULL);
 			VERIFY0(dsl_dataset_hold_obj(dp,
-			    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
+			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
+			    ds, &ds->ds_prev));
 		}
 	}
 
-	ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object);
-	ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object);
+	ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
 
-	if (prev->ds_phys->ds_next_clones_obj == 0) {
+	if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
 		dmu_buf_will_dirty(prev->ds_dbuf, tx);
-		prev->ds_phys->ds_next_clones_obj =
+		dsl_dataset_phys(prev)->ds_next_clones_obj =
 		    zap_create(dp->dp_meta_objset,
 		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 	}
 	VERIFY0(zap_add_int(dp->dp_meta_objset,
-	    prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
+	    dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
 
 	dsl_dataset_rele(ds, FTAG);
 	if (prev != dp->dp_origin_snap)
@@ -766,7 +860,7 @@
 	ASSERT(dp->dp_origin_snap != NULL);
 
 	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
-	    tx, DS_FIND_CHILDREN));
+	    tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
 }
 
 /* ARGSUSED */
@@ -776,20 +870,22 @@
 	dmu_tx_t *tx = arg;
 	objset_t *mos = dp->dp_meta_objset;
 
-	if (ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+	if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
 		dsl_dataset_t *origin;
 
 		VERIFY0(dsl_dataset_hold_obj(dp,
-		    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
+		    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
 
-		if (origin->ds_dir->dd_phys->dd_clones == 0) {
+		if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
 			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
-			origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
-			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+			dsl_dir_phys(origin->ds_dir)->dd_clones =
+			    zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
+			    0, tx);
 		}
 
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
-		    origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx));
+		    dsl_dir_phys(origin->ds_dir)->dd_clones,
+		    ds->ds_object, tx));
 
 		dsl_dataset_rele(origin, FTAG);
 	}
@@ -812,13 +908,13 @@
 	 * subobj support.  So call dmu_object_alloc() directly.
 	 */
 	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
-	    SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
 	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
 	VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
 
 	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
-	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
+	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
 }
 
 void
@@ -836,7 +932,7 @@
 	    NULL, 0, kcred, tx);
 	VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 	dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
-	VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+	VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
 	    dp, &dp->dp_origin_snap));
 	dsl_dataset_rele(ds, FTAG);
 }
@@ -1052,6 +1148,13 @@
 }
 
 void
+dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
+{
+	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
+	rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
+}
+
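rrw_enter_read_prio() (added to rrwlock.c in this sync) admits the reader
even when a writer is queued, for paths that must not block behind a
writer. A hypothetical call site:

	/* Hypothetical usage: a reader that cannot queue behind writers. */
	dsl_pool_config_enter_prio(dp, FTAG);
	/* ... inspect pool configuration state ... */
	dsl_pool_config_exit(dp, FTAG);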
+void
 dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
 {
 	rrw_exit(&dp->dp_config_rwlock, tag);
@@ -1062,3 +1165,9 @@
 {
 	return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
 }
+
+boolean_t
+dsl_pool_config_held_writer(dsl_pool_t *dp)
+{
+	return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
+}

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #include <sys/zfs_context.h>
@@ -42,20 +43,20 @@
 #define	ZPROP_RECVD_SUFFIX "$recvd"
 
 static int
-dodefault(const char *propname, int intsz, int numints, void *buf)
+dodefault(zfs_prop_t prop, int intsz, int numints, void *buf)
 {
-	zfs_prop_t prop;
-
 	/*
 	 * The setonce properties are read-only, BUT they still
 	 * have a default value that can be used as the initial
 	 * value.
 	 */
-	if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL ||
+	if (prop == ZPROP_INVAL ||
 	    (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop)))
 		return (SET_ERROR(ENOENT));
 
 	if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
+		if (zfs_prop_default_string(prop) == NULL)
+			return (SET_ERROR(ENOENT));
 		if (intsz != 1)
 			return (SET_ERROR(EOVERFLOW));
 		(void) strncpy(buf, zfs_prop_default_string(prop),
@@ -105,8 +106,8 @@
 		}
 
 		/* Check for a local value. */
-		err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
-		    intsz, numints, buf);
+		err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+		    propname, intsz, numints, buf);
 		if (err != ENOENT) {
 			if (setpoint != NULL && err == 0)
 				dsl_dir_name(dd, setpoint);
@@ -117,7 +118,7 @@
 		 * Skip the check for a received value if there is an explicit
 		 * inheritance entry.
 		 */
-		err = zap_contains(mos, dd->dd_phys->dd_props_zapobj,
+		err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
 		    inheritstr);
 		if (err != 0 && err != ENOENT)
 			break;
@@ -124,7 +125,7 @@
 
 		if (err == ENOENT) {
 			/* Check for a received value. */
-			err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
+			err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
 			    recvdstr, intsz, numints, buf);
 			if (err != ENOENT) {
 				if (setpoint != NULL && err == 0) {
@@ -149,7 +150,7 @@
 	}
 
 	if (err == ENOENT)
-		err = dodefault(propname, intsz, numints, buf);
+		err = dodefault(prop, intsz, numints, buf);
 
 	strfree(inheritstr);
 	strfree(recvdstr);
@@ -163,19 +164,17 @@
 {
 	zfs_prop_t prop = zfs_name_to_prop(propname);
 	boolean_t inheritable;
-	boolean_t snapshot;
 	uint64_t zapobj;
 
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 	inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
-	snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds));
-	zapobj = (ds->ds_phys == NULL ? 0 : ds->ds_phys->ds_props_obj);
+	zapobj = dsl_dataset_phys(ds)->ds_props_obj;
 
 	if (zapobj != 0) {
 		objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 		int err;
 
-		ASSERT(snapshot);
+		ASSERT(ds->ds_is_snapshot);
 
 		/* Check for a local value. */
 		err = zap_lookup(mos, zapobj, propname, intsz, numints, buf);
@@ -215,9 +214,61 @@
 	}
 
 	return (dsl_prop_get_dd(ds->ds_dir, propname,
-	    intsz, numints, buf, setpoint, snapshot));
+	    intsz, numints, buf, setpoint, ds->ds_is_snapshot));
 }
 
+static dsl_prop_record_t *
+dsl_prop_record_find(dsl_dir_t *dd, const char *propname)
+{
+	dsl_prop_record_t *pr = NULL;
+
+	ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+	for (pr = list_head(&dd->dd_props);
+	    pr != NULL; pr = list_next(&dd->dd_props, pr)) {
+		if (strcmp(pr->pr_propname, propname) == 0)
+			break;
+	}
+
+	return (pr);
+}
+
+static dsl_prop_record_t *
+dsl_prop_record_create(dsl_dir_t *dd, const char *propname)
+{
+	dsl_prop_record_t *pr;
+
+	ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+	pr = kmem_alloc(sizeof (dsl_prop_record_t), KM_SLEEP);
+	pr->pr_propname = spa_strdup(propname);
+	list_create(&pr->pr_cbs, sizeof (dsl_prop_cb_record_t),
+	    offsetof(dsl_prop_cb_record_t, cbr_pr_node));
+	list_insert_head(&dd->dd_props, pr);
+
+	return (pr);
+}
+
+void
+dsl_prop_init(dsl_dir_t *dd)
+{
+	list_create(&dd->dd_props, sizeof (dsl_prop_record_t),
+	    offsetof(dsl_prop_record_t, pr_node));
+}
+
+void
+dsl_prop_fini(dsl_dir_t *dd)
+{
+	dsl_prop_record_t *pr;
+
+	while ((pr = list_remove_head(&dd->dd_props)) != NULL) {
+		list_destroy(&pr->pr_cbs);
+		strfree((char *)pr->pr_propname);
+		kmem_free(pr, sizeof (dsl_prop_record_t));
+	}
+	list_destroy(&dd->dd_props);
+}
+
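For orientation, the list topology these helpers manage looks roughly like
the following (paraphrased from dsl_prop.h in this sync; consult the header
for the authoritative layout):

typedef struct dsl_prop_record {
	list_node_t	pr_node;	/* link on dd->dd_props */
	const char	*pr_propname;
	list_t		pr_cbs;		/* dsl_prop_cb_record_t list */
} dsl_prop_record_t;

typedef struct dsl_prop_cb_record {
	list_node_t	cbr_pr_node;	/* link on pr->pr_cbs */
	list_node_t	cbr_ds_node;	/* link on ds->ds_prop_cbs */
	dsl_prop_record_t	*cbr_pr;
	struct dsl_dataset	*cbr_ds;
	dsl_prop_changed_cb_t	*cbr_func;
	void			*cbr_arg;
} dsl_prop_cb_record_t;

Each callback record is threaded onto two lists at once: the per-property
list (for fast change notification) and the per-dataset list (for fast
teardown in dsl_prop_unregister_all()).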
 /*
  * Register interest in the named property.  We'll call the callback
  * once to notify it of the current property value, and again each time
@@ -232,6 +283,7 @@
 	dsl_dir_t *dd = ds->ds_dir;
 	dsl_pool_t *dp = dd->dd_pool;
 	uint64_t value;
+	dsl_prop_record_t *pr;
 	dsl_prop_cb_record_t *cbr;
 	int err;
 
@@ -243,12 +295,16 @@
 
 	cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
 	cbr->cbr_ds = ds;
-	cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP);
-	(void) strcpy((char *)cbr->cbr_propname, propname);
 	cbr->cbr_func = callback;
 	cbr->cbr_arg = cbarg;
+
 	mutex_enter(&dd->dd_lock);
-	list_insert_head(&dd->dd_prop_cbs, cbr);
+	pr = dsl_prop_record_find(dd, propname);
+	if (pr == NULL)
+		pr = dsl_prop_record_create(dd, propname);
+	cbr->cbr_pr = pr;
+	list_insert_head(&pr->pr_cbs, cbr);
+	list_insert_head(&ds->ds_prop_cbs, cbr);
 	mutex_exit(&dd->dd_lock);
 
 	cbr->cbr_func(cbr->cbr_arg, value);
@@ -327,7 +383,7 @@
 	}
 
 	mos = dd->dd_pool->dp_meta_objset;
-	zapobj = dd->dd_phys->dd_props_zapobj;
+	zapobj = dsl_dir_phys(dd)->dd_props_zapobj;
 	recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
 
 	version = spa_version(dd->dd_pool->dp_spa);
@@ -379,56 +435,34 @@
 }
 
 /*
- * Unregister this callback.  Return 0 on success, ENOENT if ddname is
- * invalid, or ENOMSG if no matching callback registered.
+ * Unregister all callbacks that are registered with the
+ * given callback argument.
  */
-int
-dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
-    dsl_prop_changed_cb_t *callback, void *cbarg)
+void
+dsl_prop_unregister_all(dsl_dataset_t *ds, void *cbarg)
 {
+	dsl_prop_cb_record_t *cbr, *next_cbr;
+
 	dsl_dir_t *dd = ds->ds_dir;
-	dsl_prop_cb_record_t *cbr;
 
 	mutex_enter(&dd->dd_lock);
-	for (cbr = list_head(&dd->dd_prop_cbs);
-	    cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
-		if (cbr->cbr_ds == ds &&
-		    cbr->cbr_func == callback &&
-		    cbr->cbr_arg == cbarg &&
-		    strcmp(cbr->cbr_propname, propname) == 0)
-			break;
+	next_cbr = list_head(&ds->ds_prop_cbs);
+	while (next_cbr != NULL) {
+		cbr = next_cbr;
+		next_cbr = list_next(&ds->ds_prop_cbs, cbr);
+		if (cbr->cbr_arg == cbarg) {
+			list_remove(&ds->ds_prop_cbs, cbr);
+			list_remove(&cbr->cbr_pr->pr_cbs, cbr);
+			kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
+		}
 	}
-
-	if (cbr == NULL) {
-		mutex_exit(&dd->dd_lock);
-		return (SET_ERROR(ENOMSG));
-	}
-
-	list_remove(&dd->dd_prop_cbs, cbr);
 	mutex_exit(&dd->dd_lock);
-	kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
-	kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
-
-	return (0);
 }
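
Callers that previously paired every dsl_prop_register() with its own
dsl_prop_unregister() can now tear down in one call keyed on the callback
argument; a hypothetical teardown path:

	/*
	 * Hypothetical: drop every callback that was registered with
	 * zfsvfs as its argument, regardless of property name.
	 */
	dsl_prop_unregister_all(dmu_objset_ds(zfsvfs->z_os), zfsvfs);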
 
 boolean_t
 dsl_prop_hascb(dsl_dataset_t *ds)
 {
-	dsl_dir_t *dd = ds->ds_dir;
-	boolean_t rv = B_FALSE;
-	dsl_prop_cb_record_t *cbr;
-
-	mutex_enter(&dd->dd_lock);
-	for (cbr = list_head(&dd->dd_prop_cbs); cbr;
-	    cbr = list_next(&dd->dd_prop_cbs, cbr)) {
-		if (cbr->cbr_ds == ds) {
-			rv = B_TRUE;
-			break;
-		}
-	}
-	mutex_exit(&dd->dd_lock);
-	return (rv);
+	return (!list_is_empty(&ds->ds_prop_cbs));
 }
 
 /* ARGSUSED */
@@ -436,16 +470,50 @@
 dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	dsl_dir_t *dd = ds->ds_dir;
+	dsl_prop_record_t *pr;
 	dsl_prop_cb_record_t *cbr;
 
 	mutex_enter(&dd->dd_lock);
-	for (cbr = list_head(&dd->dd_prop_cbs); cbr;
-	    cbr = list_next(&dd->dd_prop_cbs, cbr)) {
-		uint64_t value;
+	for (pr = list_head(&dd->dd_props);
+	    pr; pr = list_next(&dd->dd_props, pr)) {
+		for (cbr = list_head(&pr->pr_cbs); cbr;
+		    cbr = list_next(&pr->pr_cbs, cbr)) {
+			uint64_t value;
 
-		if (dsl_prop_get_ds(cbr->cbr_ds, cbr->cbr_propname,
-		    sizeof (value), 1, &value, NULL) == 0)
-			cbr->cbr_func(cbr->cbr_arg, value);
+			/*
+			 * Callback entries do not have holds on their
+			 * datasets so that datasets with registered
+			 * callbacks are still eligible for eviction.
+			 * Unlike operations to update properties on a
+			 * single dataset, we are performing a recursive
+			 * descent of related head datasets.  The caller
+			 * of this function only has a dataset hold on
+			 * the passed in head dataset, not the snapshots
+			 * associated with this dataset.  Without a hold,
+			 * the dataset pointer within callback records
+			 * for snapshots can be invalidated by eviction
+			 * at any time.
+			 *
+			 * Use dsl_dataset_try_add_ref() to verify
+			 * that the dataset for a snapshot has not
+			 * begun eviction processing and to prevent
+			 * eviction from occurring for the duration of
+			 * the callback.  If the hold attempt fails,
+			 * this object is already being evicted and the
+			 * callback can be safely ignored.
+			 */
+			if (ds != cbr->cbr_ds &&
+			    !dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
+				continue;
+
+			if (dsl_prop_get_ds(cbr->cbr_ds,
+			    cbr->cbr_pr->pr_propname, sizeof (value), 1,
+			    &value, NULL) == 0)
+				cbr->cbr_func(cbr->cbr_arg, value);
+
+			if (ds != cbr->cbr_ds)
+				dsl_dataset_rele(cbr->cbr_ds, FTAG);
+		}
 	}
 	mutex_exit(&dd->dd_lock);
 
@@ -470,6 +538,7 @@
     const char *propname, uint64_t value, int first)
 {
 	dsl_dir_t *dd;
+	dsl_prop_record_t *pr;
 	dsl_prop_cb_record_t *cbr;
 	objset_t *mos = dp->dp_meta_objset;
 	zap_cursor_t zc;
@@ -486,7 +555,8 @@
 		 * If the prop is set here, then this change is not
 		 * being inherited here or below; stop the recursion.
 		 */
-		err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname);
+		err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+		    propname);
 		if (err == 0) {
 			dsl_dir_rele(dd, FTAG);
 			return;
@@ -495,27 +565,39 @@
 	}
 
 	mutex_enter(&dd->dd_lock);
-	for (cbr = list_head(&dd->dd_prop_cbs); cbr;
-	    cbr = list_next(&dd->dd_prop_cbs, cbr)) {
-		uint64_t propobj = cbr->cbr_ds->ds_phys->ds_props_obj;
+	pr = dsl_prop_record_find(dd, propname);
+	if (pr != NULL) {
+		for (cbr = list_head(&pr->pr_cbs); cbr;
+		    cbr = list_next(&pr->pr_cbs, cbr)) {
+			uint64_t propobj;
 
-		if (strcmp(cbr->cbr_propname, propname) != 0)
-			continue;
+			/*
+			 * cbr->cbr_ds may be invalidated due to eviction,
+			 * requiring the use of dsl_dataset_try_add_ref().
+			 * See comment block in dsl_prop_notify_all_cb()
+			 * for details.
+			 */
+			if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
+				continue;
 
-		/*
-		 * If the property is set on this ds, then it is not
-		 * inherited here; don't call the callback.
-		 */
-		if (propobj && 0 == zap_contains(mos, propobj, propname))
-			continue;
+			propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj;
 
-		cbr->cbr_func(cbr->cbr_arg, value);
+			/*
+			 * If the property is not set on this ds, then it is
+			 * inherited here; call the callback.
+			 */
+			if (propobj == 0 ||
+			    zap_contains(mos, propobj, propname) != 0)
+				cbr->cbr_func(cbr->cbr_arg, value);
+
+			dsl_dataset_rele(cbr->cbr_ds, FTAG);
+		}
 	}
 	mutex_exit(&dd->dd_lock);
 
 	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 	for (zap_cursor_init(&zc, mos,
-	    dd->dd_phys->dd_child_dir_zapobj);
+	    dsl_dir_phys(dd)->dd_child_dir_zapobj);
 	    zap_cursor_retrieve(&zc, za) == 0;
 	    zap_cursor_advance(&zc)) {
 		dsl_prop_changed_notify(dp, za->za_first_integer,
@@ -542,19 +624,19 @@
 	int err;
 	uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa);
 
-	isint = (dodefault(propname, 8, 1, &intval) == 0);
+	isint = (dodefault(zfs_name_to_prop(propname), 8, 1, &intval) == 0);
 
-	if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) {
+	if (ds->ds_is_snapshot) {
 		ASSERT(version >= SPA_VERSION_SNAP_PROPS);
-		if (ds->ds_phys->ds_props_obj == 0) {
+		if (dsl_dataset_phys(ds)->ds_props_obj == 0) {
 			dmu_buf_will_dirty(ds->ds_dbuf, tx);
-			ds->ds_phys->ds_props_obj =
+			dsl_dataset_phys(ds)->ds_props_obj =
 			    zap_create(mos,
 			    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
 		}
-		zapobj = ds->ds_phys->ds_props_obj;
+		zapobj = dsl_dataset_phys(ds)->ds_props_obj;
 	} else {
-		zapobj = ds->ds_dir->dd_phys->dd_props_zapobj;
+		zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj;
 	}
 
 	if (version < SPA_VERSION_RECVD_PROPS) {
@@ -641,7 +723,7 @@
 	if (isint) {
 		VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval));
 
-		if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) {
+		if (ds->ds_is_snapshot) {
 			dsl_prop_cb_record_t *cbr;
 			/*
 			 * It's a snapshot; nothing can inherit this
@@ -649,10 +731,10 @@
 			 * ds here.
 			 */
 			mutex_enter(&ds->ds_dir->dd_lock);
-			for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr;
-			    cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) {
-				if (cbr->cbr_ds == ds &&
-				    strcmp(cbr->cbr_propname, propname) == 0)
+			for (cbr = list_head(&ds->ds_prop_cbs); cbr;
+			    cbr = list_next(&ds->ds_prop_cbs, cbr)) {
+				if (strcmp(cbr->cbr_pr->pr_propname,
+				    propname) == 0)
 					cbr->cbr_func(cbr->cbr_arg, intval);
 			}
 			mutex_exit(&ds->ds_dir->dd_lock);
@@ -759,7 +841,7 @@
 		}
 	}
 
-	if (dsl_dataset_is_snapshot(ds) && version < SPA_VERSION_SNAP_PROPS) {
+	if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
@@ -835,7 +917,7 @@
 		nblks = 2 * fnvlist_num_pairs(props);
 
 	return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync,
-	    &dpsa, nblks));
+	    &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED));
 }
 
 typedef enum dsl_prop_getflags {
@@ -978,20 +1060,20 @@
 	dsl_pool_t *dp = dd->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	int err = 0;
-	char setpoint[MAXNAMELEN];
+	char setpoint[ZFS_MAX_DATASET_NAME_LEN];
 
 	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
-	if (dsl_dataset_is_snapshot(ds))
+	if (ds->ds_is_snapshot)
 		flags |= DSL_PROP_GET_SNAPSHOT;
 
 	ASSERT(dsl_pool_config_held(dp));
 
-	if (ds->ds_phys->ds_props_obj != 0) {
+	if (dsl_dataset_phys(ds)->ds_props_obj != 0) {
 		ASSERT(flags & DSL_PROP_GET_SNAPSHOT);
 		dsl_dataset_name(ds, setpoint);
-		err = dsl_prop_get_all_impl(mos, ds->ds_phys->ds_props_obj,
-		    setpoint, flags, *nvp);
+		err = dsl_prop_get_all_impl(mos,
+		    dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp);
 		if (err)
 			goto out;
 	}
@@ -1004,8 +1086,8 @@
 			flags |= DSL_PROP_GET_INHERITING;
 		}
 		dsl_dir_name(dd, setpoint);
-		err = dsl_prop_get_all_impl(mos, dd->dd_phys->dd_props_zapobj,
-		    setpoint, flags, *nvp);
+		err = dsl_prop_get_all_impl(mos,
+		    dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp);
 		if (err)
 			break;
 	}
@@ -1100,7 +1182,7 @@
 	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
 	/* Indicate the default source if we can. */
-	if (dodefault(propname, 8, 1, &default_value) == 0 &&
+	if (dodefault(prop, 8, 1, &default_value) == 0 &&
 	    value == default_value) {
 		VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0);
 	}

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 Gary Mills
  */
 
 #include <sys/dsl_scan.h>
@@ -51,13 +52,13 @@
 #include <sys/zfs_vfsops.h>
 #endif
 
-typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
+typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
+    const zbookmark_phys_t *);
 
-static scan_cb_t dsl_scan_defrag_cb;
 static scan_cb_t dsl_scan_scrub_cb;
-static scan_cb_t dsl_scan_remove_cb;
 static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
-static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
+static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *);
+static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *);
 
 unsigned int zfs_top_maxinflight = 32;	/* maximum I/Os per top-level */
 unsigned int zfs_resilver_delay = 2;	/* number of ticks to delay resilver */
@@ -69,7 +70,7 @@
 unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver
 						 per txg */
 boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
-boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */
+boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
 
 SYSCTL_DECL(_vfs_zfs);
 TUNABLE_INT("vfs.zfs.top_maxinflight", &zfs_top_maxinflight);
@@ -101,7 +102,12 @@
     &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");
 
 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+/* max number of blocks to free in a single TXG */
+uint64_t zfs_free_max_blocks = UINT64_MAX;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN,
+    &zfs_free_max_blocks, 0, "Maximum number of blocks to free in one TXG");
 
+
 #define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
 	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
 	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
@@ -108,6 +114,14 @@
 
 extern int zfs_txg_timeout;
 
+/*
+ * Enable/disable the processing of the free_bpobj object.
+ */
+boolean_t zfs_free_bpobj_enabled = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN,
+    &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing");
+
 /* the order has to match pool_scan_type */
 static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
 	NULL,
@@ -133,7 +147,7 @@
 	 */
 	ASSERT(!scn->scn_async_destroying);
 	scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
-	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);
+	    SPA_FEATURE_ASYNC_DESTROY);
 
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    "scrub_func", sizeof (uint64_t), 1, &f);
@@ -225,6 +239,7 @@
 	scn->scn_phys.scn_errors = 0;
 	scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
 	scn->scn_restart_txg = 0;
+	scn->scn_done_txg = 0;
 	spa_scan_stat_init(spa);
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
@@ -316,8 +331,15 @@
 	else
 		scn->scn_phys.scn_state = DSS_CANCELED;
 
-	spa_history_log_internal(spa, "scan done", tx,
-	    "complete=%u", complete);
+	if (dsl_scan_restarting(scn, tx))
+		spa_history_log_internal(spa, "scan aborted, restarting", tx,
+		    "errors=%llu", spa_get_errlog_size(spa));
+	else if (!complete)
+		spa_history_log_internal(spa, "scan cancelled", tx,
+		    "errors=%llu", spa_get_errlog_size(spa));
+	else
+		spa_history_log_internal(spa, "scan done", tx,
+		    "errors=%llu", spa_get_errlog_size(spa));
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 		mutex_enter(&spa->spa_scrub_lock);
@@ -377,16 +399,15 @@
 dsl_scan_cancel(dsl_pool_t *dp)
 {
 	return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
-	    dsl_scan_cancel_sync, NULL, 3));
+	    dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
 }
 
-static void dsl_scan_visitbp(blkptr_t *bp,
-    const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
-    dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
-    dmu_tx_t *tx);
+static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+    dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+    dmu_objset_type_t ostype, dmu_tx_t *tx);
 static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
     dmu_objset_type_t ostype,
-    dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
+    dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
 
 void
 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
@@ -406,8 +427,8 @@
 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 {
 	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
-	if (dsl_dataset_is_snapshot(ds))
-		return (MIN(smt, ds->ds_phys->ds_creation_txg));
+	if (ds->ds_is_snapshot)
+		return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
 	return (smt);
 }
 
@@ -420,12 +441,11 @@
 	    &scn->scn_phys, tx));
 }
 
+extern int zfs_vdev_async_write_active_min_dirty_percent;
+
 static boolean_t
-dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
+dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 {
-	uint64_t elapsed_nanosecs;
-	unsigned int mintime;
-
 	/* we never skip user/group accounting objects */
 	if (zb && (int64_t)zb->zb_object < 0)
 		return (B_FALSE);
@@ -440,12 +460,28 @@
 	if (zb && zb->zb_level != 0)
 		return (B_FALSE);
 
-	mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+	/*
+	 * We pause if:
+	 *  - we have scanned for the maximum time: an entire txg
+	 *    timeout (default 5 sec)
+	 *  or
+	 *  - we have scanned for at least the minimum time (default 1 sec
+	 *    for scrub, 3 sec for resilver), and either we have sufficient
+	 *    dirty data that we are starting to write more quickly
+	 *    (default 30%), or someone is explicitly waiting for this txg
+	 *    to complete.
+	 *  or
+	 *  - the spa is shutting down because this pool is being exported
+	 *    or the machine is rebooting.
+	 */
+	int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
 	    zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
-	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
-	if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
-	    (elapsed_nanosecs / MICROSEC > mintime &&
-	    txg_sync_waiting(scn->scn_dp)) ||
+	uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+	int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
+	if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout ||
+	    (NSEC2MSEC(elapsed_nanosecs) > mintime &&
+	    (txg_sync_waiting(scn->scn_dp) ||
+	    dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa)) {
 		if (zb) {
 			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
@@ -479,9 +515,9 @@
 	dsl_pool_t *dp = zsa->zsa_dp;
 	dsl_scan_t *scn = dp->dp_scan;
 	zil_header_t *zh = zsa->zsa_zh;
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 
-	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 		return (0);
 
 	/*
@@ -511,9 +547,10 @@
 		zil_header_t *zh = zsa->zsa_zh;
 		lr_write_t *lr = (lr_write_t *)lrc;
 		blkptr_t *bp = &lr->lr_blkptr;
-		zbookmark_t zb;
+		zbookmark_phys_t zb;
 
-		if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+		if (BP_IS_HOLE(bp) ||
+		    bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 			return (0);
 
 		/*
@@ -560,8 +597,8 @@
 dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
     uint64_t objset, uint64_t object, uint64_t blkid)
 {
-	zbookmark_t czb;
-	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+	zbookmark_phys_t czb;
+	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 
 	if (zfs_no_scrub_prefetch)
 		return;
@@ -579,7 +616,7 @@
 
 static boolean_t
 dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
-    const zbookmark_t *zb)
+    const zbookmark_phys_t *zb)
 {
 	/*
 	 * We never skip over user/group accounting objects (obj<0)
@@ -590,7 +627,8 @@
 		 * If we already visited this bp & everything below (in
 		 * a prior txg sync), don't bother doing it again.
 		 */
-		if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+		if (zbookmark_subtree_completed(dnp, zb,
+		    &scn->scn_phys.scn_bookmark))
 			return (B_TRUE);
 
 		/*
@@ -619,7 +657,7 @@
 static int
 dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
     dnode_phys_t *dnp, const blkptr_t *bp,
-    const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
+    const zbookmark_phys_t *zb, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
@@ -626,68 +664,64 @@
 	int err;
 
 	if (BP_GET_LEVEL(bp) > 0) {
-		uint32_t flags = ARC_WAIT;
+		arc_flags_t flags = ARC_FLAG_WAIT;
 		int i;
 		blkptr_t *cbp;
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+		arc_buf_t *buf;
 
-		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
+		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
-		for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
-			dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
+		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+			dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset,
 			    zb->zb_object, zb->zb_blkid * epb + i);
 		}
-		for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
-			zbookmark_t czb;
+		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+			zbookmark_phys_t czb;
 
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			dsl_scan_visitbp(cbp, &czb, dnp,
-			    *bufp, ds, scn, ostype, tx);
+			    ds, scn, ostype, tx);
 		}
-	} else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
-		uint32_t flags = ARC_WAIT;
-
-		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
-		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
-		if (err) {
-			scn->scn_phys.scn_errors++;
-			return (err);
-		}
+		arc_buf_destroy(buf, &buf);
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
-		uint32_t flags = ARC_WAIT;
+		arc_flags_t flags = ARC_FLAG_WAIT;
 		dnode_phys_t *cdnp;
 		int i, j;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+		arc_buf_t *buf;
 
-		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
+		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
-		for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
 			for (j = 0; j < cdnp->dn_nblkptr; j++) {
 				blkptr_t *cbp = &cdnp->dn_blkptr[j];
-				dsl_scan_prefetch(scn, *bufp, cbp,
+				dsl_scan_prefetch(scn, buf, cbp,
 				    zb->zb_objset, zb->zb_blkid * epb + i, j);
 			}
 		}
-		for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
 			dsl_scan_visitdnode(scn, ds, ostype,
-			    cdnp, *bufp, zb->zb_blkid * epb + i, tx);
+			    cdnp, zb->zb_blkid * epb + i, tx);
 		}
 
+		arc_buf_destroy(buf, &buf);
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
-		uint32_t flags = ARC_WAIT;
+		arc_flags_t flags = ARC_FLAG_WAIT;
 		objset_phys_t *osp;
+		arc_buf_t *buf;
 
-		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
+		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
@@ -694,12 +728,12 @@
 			return (err);
 		}
 
-		osp = (*bufp)->b_data;
+		osp = buf->b_data;
 
 		dsl_scan_visitdnode(scn, ds, osp->os_type,
-		    &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
+		    &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
 
-		if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
+		if (OBJSET_BUF_HAS_USERUSED(buf)) {
 			/*
 			 * We also always visit user/group accounting
 			 * objects, and never skip them, even if we are
@@ -707,12 +741,13 @@
 			 * deltas from this txg get integrated.
 			 */
 			dsl_scan_visitdnode(scn, ds, osp->os_type,
-			    &osp->os_groupused_dnode, *bufp,
+			    &osp->os_groupused_dnode,
 			    DMU_GROUPUSED_OBJECT, tx);
 			dsl_scan_visitdnode(scn, ds, osp->os_type,
-			    &osp->os_userused_dnode, *bufp,
+			    &osp->os_userused_dnode,
 			    DMU_USERUSED_OBJECT, tx);
 		}
+		arc_buf_destroy(buf, &buf);
 	}
 
 	return (0);
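
The recurring pattern in this rewrite: dsl_scan_recurse() no longer threads
an arc_buf_t back to its caller through bufp; each block level reads
synchronously and frees the buffer locally. Condensed, with names as in the
hunk above:

	arc_flags_t flags = ARC_FLAG_WAIT;
	arc_buf_t *buf;
	int err;

	/* Synchronous read; on success this function owns buf. */
	err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
	    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
	if (err != 0) {
		scn->scn_phys.scn_errors++;
		return (err);
	}
	/* ... visit buf->b_data ... */
	arc_buf_destroy(buf, &buf);	/* freed here, not by the caller */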
@@ -720,26 +755,26 @@
 
 static void
 dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
-    dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
+    dmu_objset_type_t ostype, dnode_phys_t *dnp,
     uint64_t object, dmu_tx_t *tx)
 {
 	int j;
 
 	for (j = 0; j < dnp->dn_nblkptr; j++) {
-		zbookmark_t czb;
+		zbookmark_phys_t czb;
 
 		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
 		    dnp->dn_nlevels - 1, j);
 		dsl_scan_visitbp(&dnp->dn_blkptr[j],
-		    &czb, dnp, buf, ds, scn, ostype, tx);
+		    &czb, dnp, ds, scn, ostype, tx);
 	}
 
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
-		zbookmark_t czb;
+		zbookmark_phys_t czb;
 		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
 		    0, DMU_SPILL_BLKID);
 		dsl_scan_visitbp(&dnp->dn_spill,
-		    &czb, dnp, buf, ds, scn, ostype, tx);
+		    &czb, dnp, ds, scn, ostype, tx);
 	}
 }
 
@@ -748,10 +783,9 @@
  * first 5; we want them to be useful.
  */
 static void
-dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
-    dnode_phys_t *dnp, arc_buf_t *pbuf,
-    dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
-    dmu_tx_t *tx)
+dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+    dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+    dmu_objset_type_t ostype, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	arc_buf_t *buf = NULL;
@@ -765,22 +799,21 @@
 	if (dsl_scan_check_resume(scn, dnp, zb))
 		return;
 
-	if (bp->blk_birth == 0)
+	if (BP_IS_HOLE(bp))
 		return;
 
 	scn->scn_visited_this_txg++;
 
 	dprintf_bp(bp,
-	    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
+	    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
 	    ds, ds ? ds->ds_object : 0,
 	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
-	    pbuf, bp);
+	    bp);
 
 	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 		return;
 
-	if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
-	    &buf) != 0)
+	if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0)
 		return;
 
 	/*
@@ -801,11 +834,9 @@
 	 * Don't scan it now unless we need to because something
 	 * under it was modified.
 	 */
-	if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
+	if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
 		scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
 	}
-	if (buf)
-		(void) arc_buf_remove_ref(buf, &buf);
 }
 
 static void
@@ -812,11 +843,11 @@
 dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
     dmu_tx_t *tx)
 {
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 
 	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
-	dsl_scan_visitbp(bp, &zb, NULL, NULL,
+	dsl_scan_visitbp(bp, &zb, NULL,
 	    ds, scn, DMU_OST_NONE, tx);
 
 	dprintf_ds(ds, "finished scan%s", "");
@@ -833,14 +864,24 @@
 		return;
 
 	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
-		if (dsl_dataset_is_snapshot(ds)) {
-			/* Note, scn_cur_{min,max}_txg stays the same. */
+		if (ds->ds_is_snapshot) {
+			/*
+			 * Note:
+			 *  - scn_cur_{min,max}_txg stays the same.
+			 *  - Setting the flag is not really necessary if
+			 *    scn_cur_max_txg == scn_max_txg, because there
+			 *    is nothing after this snapshot that we care
+			 *    about.  However, we set it anyway and then
+			 *    ignore it when we retraverse it in
+			 *    dsl_scan_visitds().
+			 */
 			scn->scn_phys.scn_bookmark.zb_objset =
-			    ds->ds_phys->ds_next_snap_obj;
+			    dsl_dataset_phys(ds)->ds_next_snap_obj;
 			zfs_dbgmsg("destroying ds %llu; currently traversing; "
 			    "reset zb_objset to %llu",
 			    (u_longlong_t)ds->ds_object,
-			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+			    (u_longlong_t)dsl_dataset_phys(ds)->
+			    ds_next_snap_obj);
 			scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
 		} else {
 			SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
@@ -851,10 +892,10 @@
 		}
 	} else if (zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
-		ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+		ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
-		if (dsl_dataset_is_snapshot(ds)) {
+		if (ds->ds_is_snapshot) {
 			/*
 			 * We keep the same mintxg; it could be >
 			 * ds_creation_txg if the previous snapshot was
@@ -862,18 +903,17 @@
 			 */
 			VERIFY(zap_add_int_key(dp->dp_meta_objset,
 			    scn->scn_phys.scn_queue_obj,
-			    ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
+			    dsl_dataset_phys(ds)->ds_next_snap_obj,
+			    mintxg, tx) == 0);
 			zfs_dbgmsg("destroying ds %llu; in queue; "
 			    "replacing with %llu",
 			    (u_longlong_t)ds->ds_object,
-			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+			    (u_longlong_t)dsl_dataset_phys(ds)->
+			    ds_next_snap_obj);
 		} else {
 			zfs_dbgmsg("destroying ds %llu; in queue; removing",
 			    (u_longlong_t)ds->ds_object);
 		}
-	} else {
-		zfs_dbgmsg("destroying ds %llu; ignoring",
-		    (u_longlong_t)ds->ds_object);
 	}
 
 	/*
@@ -893,15 +933,15 @@
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
-	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
+	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
 
 	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
 		scn->scn_phys.scn_bookmark.zb_objset =
-		    ds->ds_phys->ds_prev_snap_obj;
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
 		    "reset zb_objset to %llu",
 		    (u_longlong_t)ds->ds_object,
-		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
 	} else if (zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
@@ -908,11 +948,11 @@
 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj,
-		    ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
 		zfs_dbgmsg("snapshotting ds %llu; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds->ds_object,
-		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
 	}
 	dsl_scan_sync_state(scn, tx);
 }
@@ -945,8 +985,8 @@
 	    ds1->ds_object, &mintxg) == 0) {
 		int err;
 
-		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
-		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
 		err = zap_add_int_key(dp->dp_meta_objset,
@@ -964,8 +1004,8 @@
 		    (u_longlong_t)ds2->ds_object);
 	} else if (zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
-		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
-		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
 		VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
@@ -993,7 +1033,7 @@
 	int err;
 	dsl_scan_t *scn = dp->dp_scan;
 
-	if (hds->ds_dir->dd_phys->dd_origin_obj != eca->originobj)
+	if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj)
 		return (0);
 
 	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
@@ -1000,10 +1040,10 @@
 	if (err)
 		return (err);
 
-	while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
+	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) {
 		dsl_dataset_t *prev;
 		err = dsl_dataset_hold_obj(dp,
-		    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 
 		dsl_dataset_rele(ds, FTAG);
 		if (err)
@@ -1012,7 +1052,7 @@
 	}
 	VERIFY(zap_add_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds->ds_object,
-	    ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0);
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
@@ -1026,6 +1066,46 @@
 
 	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 
+	if (scn->scn_phys.scn_cur_min_txg >=
+	    scn->scn_phys.scn_max_txg) {
+		/*
+		 * This can happen if this snapshot was created after the
+		 * scan started, and we already completed a previous snapshot
+		 * that was created after the scan started.  This snapshot
+		 * only references blocks with:
+		 *
+		 *	birth < our ds_creation_txg
+		 *	cur_min_txg is no less than ds_creation_txg.
+		 *	We have already visited these blocks.
+		 * or
+		 *	birth > scn_max_txg
+		 *	The scan requested not to visit these blocks.
+		 *
+		 * Subsequent snapshots (and clones) can reference our
+		 * blocks, or blocks with even higher birth times.
+		 * Therefore we do not need to visit them either,
+		 * so we do not add them to the work queue.
+		 *
+		 * Note that checking for cur_min_txg >= cur_max_txg
+		 * is not sufficient, because in that case we may need to
+		 * visit subsequent snapshots.  This happens when min_txg > 0,
+		 * which raises cur_min_txg.  In this case we will visit
+		 * this dataset but skip all of its blocks, because the
+		 * rootbp's birth time is < cur_min_txg.  Then we will
+		 * add the next snapshots/clones to the work queue.
+		 */
+		char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+		dsl_dataset_name(ds, dsname);
+		zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
+		    "cur_min_txg (%llu) >= max_txg (%llu)",
+		    dsobj, dsname,
+		    scn->scn_phys.scn_cur_min_txg,
+		    scn->scn_phys.scn_max_txg);
+		kmem_free(dsname, MAXNAMELEN);
+
+		goto out;
+	}
+
 	if (dmu_objset_from_ds(ds, &os))
 		goto out;
 
@@ -1036,7 +1116,7 @@
 	 * ZIL here, rather than in scan_recurse(), because the regular
 	 * snapshot block-sharing rules don't apply to it.
 	 */
-	if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds))
+	if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot)
 		dsl_scan_zil(dp, &os->os_zil_header);
 
 	/*
@@ -1043,9 +1123,11 @@
 	 * Iterate over the bps in this ds.
 	 */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
-	char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
+	char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 	dsl_dataset_name(ds, dsname);
 	zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
 	    "pausing=%u",
@@ -1053,7 +1135,7 @@
 	    (longlong_t)scn->scn_phys.scn_cur_min_txg,
 	    (longlong_t)scn->scn_phys.scn_cur_max_txg,
 	    (int)scn->scn_pausing);
-	kmem_free(dsname, ZFS_MAXNAMELEN);
+	kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
 
 	if (scn->scn_pausing)
 		goto out;
@@ -1077,14 +1159,15 @@
 	/*
 	 * Add descendent datasets to work queue.
 	 */
-	if (ds->ds_phys->ds_next_snap_obj != 0) {
+	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
-		    scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
-		    ds->ds_phys->ds_creation_txg, tx) == 0);
+		    scn->scn_phys.scn_queue_obj,
+		    dsl_dataset_phys(ds)->ds_next_snap_obj,
+		    dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0);
 	}
-	if (ds->ds_phys->ds_num_children > 1) {
+	if (dsl_dataset_phys(ds)->ds_num_children > 1) {
 		boolean_t usenext = B_FALSE;
-		if (ds->ds_phys->ds_next_clones_obj != 0) {
+		if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
 			uint64_t count;
 			/*
 			 * A bug in a previous version of the code could
@@ -1094,17 +1177,17 @@
 			 * next_clones_obj when its count is correct.
 			 */
 			int err = zap_count(dp->dp_meta_objset,
-			    ds->ds_phys->ds_next_clones_obj, &count);
+			    dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
 			if (err == 0 &&
-			    count == ds->ds_phys->ds_num_children - 1)
+			    count == dsl_dataset_phys(ds)->ds_num_children - 1)
 				usenext = B_TRUE;
 		}
 
 		if (usenext) {
 			VERIFY0(zap_join_key(dp->dp_meta_objset,
-			    ds->ds_phys->ds_next_clones_obj,
+			    dsl_dataset_phys(ds)->ds_next_clones_obj,
 			    scn->scn_phys.scn_queue_obj,
-			    ds->ds_phys->ds_creation_txg, tx));
+			    dsl_dataset_phys(ds)->ds_creation_txg, tx));
 		} else {
 			struct enqueue_clones_arg eca;
 			eca.tx = tx;
@@ -1132,10 +1215,10 @@
 	if (err)
 		return (err);
 
-	while (ds->ds_phys->ds_prev_snap_obj != 0) {
+	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		dsl_dataset_t *prev;
-		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
-		    FTAG, &prev);
+		err = dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 		if (err) {
 			dsl_dataset_rele(ds, FTAG);
 			return (err);
@@ -1144,7 +1227,7 @@
 		/*
 		 * If this is a clone, we don't need to worry about it for now.
 		 */
-		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
 			dsl_dataset_rele(ds, FTAG);
 			dsl_dataset_rele(prev, FTAG);
 			return (0);
@@ -1154,7 +1237,7 @@
 	}
 
 	VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
-	    ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
+	    ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0);
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
@@ -1239,7 +1322,7 @@
 	const ddt_key_t *ddk = &dde->dde_key;
 	ddt_phys_t *ddp = dde->dde_phys;
 	blkptr_t bp;
-	zbookmark_t zb = { 0 };
+	zbookmark_phys_t zb = { 0 };
 
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
@@ -1246,7 +1329,7 @@
 
 	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 		if (ddp->ddp_phys_birth == 0 ||
-		    ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
+		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
 			continue;
 		ddt_bp_create(checksum, ddk, ddp, &bp);
 
@@ -1307,7 +1390,7 @@
 	 * In case we were paused right at the end of the ds, zero the
 	 * bookmark so we don't think that we're still trying to resume.
 	 */
-	bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
+	bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
 
 	/* keep pulling things out of the zap-object-as-queue */
 	while (zap_cursor_init(&zc, dp->dp_meta_objset,
@@ -1329,7 +1412,7 @@
 		} else {
 			scn->scn_phys.scn_cur_min_txg =
 			    MAX(scn->scn_phys.scn_min_txg,
-			    ds->ds_phys->ds_prev_snap_txg);
+			    dsl_dataset_phys(ds)->ds_prev_snap_txg);
 		}
 		scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
 		dsl_dataset_rele(ds, FTAG);
@@ -1347,9 +1430,15 @@
 {
 	uint64_t elapsed_nanosecs;
 
+	if (zfs_recover)
+		return (B_FALSE);
+
+	if (scn->scn_visited_this_txg >= zfs_free_max_blocks)
+		return (B_TRUE);
+
 	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
 	return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
-	    (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
+	    (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
 	    txg_sync_waiting(scn->scn_dp)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa));
 }
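
A note on the timing hunk above: switching from open-coded division to NSEC2MSEC() is a readability change, not a behavioral one, since both forms divide a nanosecond interval by 1,000,000 (MICROSEC) to obtain milliseconds:

	NSEC2MSEC(1500000000) == 1500000000 / 1000000 == 1500	/* ms */

The behavioral additions in this hunk are the zfs_recover escape hatch and the zfs_free_max_blocks cap on blocks freed per txg.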
@@ -1384,9 +1473,8 @@
 		return (B_FALSE);
 	if (spa_shutting_down(spa))
 		return (B_FALSE);
-
 	if (scn->scn_phys.scn_state == DSS_SCANNING ||
-	    scn->scn_async_destroying)
+	    (scn->scn_async_destroying && !scn->scn_async_stalled))
 		return (B_TRUE);
 
 	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
@@ -1401,7 +1489,7 @@
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
-	int err;
+	int err = 0;
 
 	/*
 	 * Check for scn_restart_txg before checking spa_load_state, so
@@ -1408,8 +1496,7 @@
 	 * that we can restart an old-style scan while the pool is being
 	 * imported (see dsl_scan_init).
 	 */
-	if (scn->scn_restart_txg != 0 &&
-	    scn->scn_restart_txg <= tx->tx_txg) {
+	if (dsl_scan_restarting(scn, tx)) {
 		pool_scan_func_t func = POOL_SCAN_SCRUB;
 		dsl_scan_done(scn, B_FALSE, tx);
 		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
@@ -1419,10 +1506,26 @@
 		dsl_scan_setup_sync(&func, tx);
 	}
 
-	if (!dsl_scan_active(scn) ||
-	    spa_sync_pass(dp->dp_spa) > 1)
+	/*
+	 * Only process scans in sync pass 1.
+	 */
+	if (spa_sync_pass(dp->dp_spa) > 1)
 		return;
 
+	/*
+	 * If the spa is shutting down, then stop scanning. This will
+	 * ensure that the scan does not dirty any new data during the
+	 * shutdown phase.
+	 */
+	if (spa_shutting_down(spa))
+		return;
+
+	/*
+	 * If the scan is inactive due to a stalled async destroy, try again.
+	 */
+	if (!scn->scn_async_stalled && !dsl_scan_active(scn))
+		return;
+
 	scn->scn_visited_this_txg = 0;
 	scn->scn_pausing = B_FALSE;
 	scn->scn_sync_start_time = gethrtime();
@@ -1429,12 +1532,14 @@
 	spa->spa_scrub_active = B_TRUE;
 
 	/*
-	 * First process the free list.  If we pause the free, don't do
-	 * any scanning.  This ensures that there is no free list when
-	 * we are scanning, so the scan code doesn't have to worry about
-	 * traversing it.
+	 * First process the async destroys.  If we pause, don't do
+	 * any scrubbing or resilvering.  This ensures that there are no
+	 * async destroys while we are scanning, so the scan code doesn't
+	 * have to worry about traversing it.  It is also faster to free the
+	 * blocks than to scrub them.
 	 */
-	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+	if (zfs_free_bpobj_enabled &&
+	    spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		scn->scn_is_bptree = B_FALSE;
 		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);
@@ -1442,53 +1547,117 @@
 		    dsl_scan_free_block_cb, scn, tx);
 		VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
 
-		if (err == 0 && spa_feature_is_active(spa,
-		    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
-			ASSERT(scn->scn_async_destroying);
-			scn->scn_is_bptree = B_TRUE;
-			scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
-			    NULL, ZIO_FLAG_MUSTSUCCEED);
-			err = bptree_iterate(dp->dp_meta_objset,
-			    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
-			    scn, tx);
-			VERIFY0(zio_wait(scn->scn_zio_root));
+		if (err != 0 && err != ERESTART)
+			zfs_panic_recover("error %u from bpobj_iterate()", err);
+	}
 
-			if (err == 0) {
-				zfeature_info_t *feat = &spa_feature_table
-				    [SPA_FEATURE_ASYNC_DESTROY];
-				/* finished; deactivate async destroy feature */
-				spa_feature_decr(spa, feat, tx);
-				ASSERT(!spa_feature_is_active(spa, feat));
-				VERIFY0(zap_remove(dp->dp_meta_objset,
-				    DMU_POOL_DIRECTORY_OBJECT,
-				    DMU_POOL_BPTREE_OBJ, tx));
-				VERIFY0(bptree_free(dp->dp_meta_objset,
-				    dp->dp_bptree_obj, tx));
-				dp->dp_bptree_obj = 0;
-				scn->scn_async_destroying = B_FALSE;
-			}
+	if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
+		ASSERT(scn->scn_async_destroying);
+		scn->scn_is_bptree = B_TRUE;
+		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+		    NULL, ZIO_FLAG_MUSTSUCCEED);
+		err = bptree_iterate(dp->dp_meta_objset,
+		    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
+		VERIFY0(zio_wait(scn->scn_zio_root));
+
+		if (err == EIO || err == ECKSUM) {
+			err = 0;
+		} else if (err != 0 && err != ERESTART) {
+			zfs_panic_recover("error %u from "
+			    "traverse_dataset_destroyed()", err);
 		}
-		if (scn->scn_visited_this_txg) {
-			zfs_dbgmsg("freed %llu blocks in %llums from "
-			    "free_bpobj/bptree txg %llu",
-			    (longlong_t)scn->scn_visited_this_txg,
-			    (longlong_t)
-			    (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
-			    (longlong_t)tx->tx_txg);
-			scn->scn_visited_this_txg = 0;
+
+		if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
+			/* finished; deactivate async destroy feature */
+			spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
+			ASSERT(!spa_feature_is_active(spa,
+			    SPA_FEATURE_ASYNC_DESTROY));
+			VERIFY0(zap_remove(dp->dp_meta_objset,
+			    DMU_POOL_DIRECTORY_OBJECT,
+			    DMU_POOL_BPTREE_OBJ, tx));
+			VERIFY0(bptree_free(dp->dp_meta_objset,
+			    dp->dp_bptree_obj, tx));
+			dp->dp_bptree_obj = 0;
+			scn->scn_async_destroying = B_FALSE;
+			scn->scn_async_stalled = B_FALSE;
+		} else {
 			/*
-			 * Re-sync the ddt so that we can further modify
-			 * it when doing bprewrite.
+			 * If we didn't make progress, mark the async
+			 * destroy as stalled, so that we will not initiate
+			 * a spa_sync() on its behalf.  Note that we only
+			 * check this if we are not finished, because if the
+			 * bptree had no blocks for us to visit, we can
+			 * finish without "making progress".
 			 */
-			ddt_sync(spa, tx->tx_txg);
+			scn->scn_async_stalled =
+			    (scn->scn_visited_this_txg == 0);
 		}
-		if (err == ERESTART)
-			return;
 	}
+	if (scn->scn_visited_this_txg) {
+		zfs_dbgmsg("freed %llu blocks in %llums from "
+		    "free_bpobj/bptree txg %llu; err=%d",
+		    (longlong_t)scn->scn_visited_this_txg,
+		    (longlong_t)
+		    NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
+		    (longlong_t)tx->tx_txg, err);
+		scn->scn_visited_this_txg = 0;
 
+		/*
+		 * Write out changes to the DDT that may be required as a
+		 * result of the blocks freed.  This ensures that the DDT
+		 * is clean when a scrub/resilver runs.
+		 */
+		ddt_sync(spa, tx->tx_txg);
+	}
+	if (err != 0)
+		return;
+	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
+	    zfs_free_leak_on_eio &&
+	    (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
+	    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
+	    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
+		/*
+		 * We have finished background destroying, but there is still
+		 * some space left in the dp_free_dir. Transfer this leaked
+		 * space to the dp_leak_dir.
+		 */
+		if (dp->dp_leak_dir == NULL) {
+			rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+			(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+			    LEAK_DIR_NAME, tx);
+			VERIFY0(dsl_pool_open_special_dir(dp,
+			    LEAK_DIR_NAME, &dp->dp_leak_dir));
+			rrw_exit(&dp->dp_config_rwlock, FTAG);
+		}
+		dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
+		    dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+		    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+		    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+		    -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+		    -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+		    -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+	}
+	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) {
+		/* finished; verify that space accounting went to zero */
+		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
+		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
+		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
+	}
+
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
+	if (scn->scn_done_txg == tx->tx_txg) {
+		ASSERT(!scn->scn_pausing);
+		/* finished with scan. */
+		zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
+		dsl_scan_done(scn, B_TRUE, tx);
+		ASSERT3U(spa->spa_scrub_inflight, ==, 0);
+		dsl_scan_sync_state(scn, tx);
+		return;
+	}
+
 	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
 	    scn->scn_phys.scn_ddt_class_max) {
 		zfs_dbgmsg("doing scan sync txg %llu; "
@@ -1521,12 +1690,12 @@
 
 	zfs_dbgmsg("visited %llu blocks in %llums",
 	    (longlong_t)scn->scn_visited_this_txg,
-	    (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
+	    (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
 
 	if (!scn->scn_pausing) {
-		/* finished with scan. */
-		zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
-		dsl_scan_done(scn, B_TRUE, tx);
+		scn->scn_done_txg = tx->tx_txg + 1;
+		zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
+		    tx->tx_txg, scn->scn_done_txg);
 	}
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
@@ -1640,7 +1809,7 @@
 
 static int
 dsl_scan_scrub_cb(dsl_pool_t *dp,
-    const blkptr_t *bp, const zbookmark_t *zb)
+    const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	size_t size = BP_GET_PSIZE(bp);
@@ -1648,7 +1817,6 @@
 	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
 	boolean_t needs_io;
 	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
-	int zio_priority;
 	unsigned int scan_delay = 0;
 
 	if (phys_birth <= scn->scn_phys.scn_min_txg ||
@@ -1657,16 +1825,17 @@
 
 	count_block(dp->dp_blkstats, bp);
 
+	if (BP_IS_EMBEDDED(bp))
+		return (0);
+
 	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
 	if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
 		zio_flags |= ZIO_FLAG_SCRUB;
-		zio_priority = ZIO_PRIORITY_SCRUB;
 		needs_io = B_TRUE;
 		scan_delay = zfs_scrub_delay;
 	} else {
 		ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
 		zio_flags |= ZIO_FLAG_RESILVER;
-		zio_priority = ZIO_PRIORITY_RESILVER;
 		needs_io = B_FALSE;
 		scan_delay = zfs_resilver_delay;
 	}
@@ -1725,7 +1894,7 @@
 			delay(MAX((int)scan_delay, 0));
 
 		zio_nowait(zio_read(NULL, spa, bp, data, size,
-		    dsl_scan_scrub_done, NULL, zio_priority,
+		    dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB,
 		    zio_flags, zb));
 	}
 
@@ -1752,5 +1921,12 @@
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
-	    dsl_scan_setup_sync, &func, 0));
+	    dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
 }
+
+static boolean_t
+dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	return (scn->scn_restart_txg != 0 &&
+	    scn->scn_restart_txg <= tx->tx_txg);
+}

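To make the new dsl_scan_visitds() early-out concrete, here is a hedged sketch of the window test with invented txg numbers; the helper name is hypothetical and only the scn_cur_min_txg >= scn_max_txg comparison comes from the change above.

	/*
	 * Illustrative only.  A snapshot created at txg 120 during a scan
	 * bounded by scn_max_txg = 100 gets cur_min_txg = 120: every block
	 * it references was either born before txg 120 (already visited)
	 * or after txg 100 (outside the scan's scope), so the dataset can
	 * be skipped without adding its descendents to the work queue.
	 */
	static boolean_t
	scan_ds_window_is_empty(uint64_t cur_min_txg, uint64_t max_txg)
	{
		return (cur_min_txg >= max_txg);
	}
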
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -65,7 +65,8 @@
  */
 int
 dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
-    dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified)
+    dsl_syncfunc_t *syncfunc, void *arg,
+    int blocks_modified, zfs_space_check_t space_check)
 {
 	spa_t *spa;
 	dmu_tx_t *tx;
@@ -85,6 +86,7 @@
 	dst.dst_pool = dp;
 	dst.dst_txg = dmu_tx_get_txg(tx);
 	dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+	dst.dst_space_check = space_check;
 	dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc;
 	dst.dst_syncfunc = syncfunc;
 	dst.dst_arg = arg;
@@ -118,7 +120,7 @@
 
 void
 dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
-    int blocks_modified, dmu_tx_t *tx)
+    int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
 {
 	dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
 
@@ -125,6 +127,7 @@
 	dst->dst_pool = dp;
 	dst->dst_txg = dmu_tx_get_txg(tx);
 	dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+	dst->dst_space_check = space_check;
 	dst->dst_checkfunc = dsl_null_checkfunc;
 	dst->dst_syncfunc = syncfunc;
 	dst->dst_arg = arg;
@@ -141,25 +144,34 @@
 dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dst->dst_pool;
-	uint64_t quota, used;
 
 	ASSERT0(dst->dst_error);
 
 	/*
-	 * Check for sufficient space.  We just check against what's
-	 * on-disk; we don't want any in-flight accounting to get in our
-	 * way, because open context may have already used up various
-	 * in-core limits (arc_tempreserve, dsl_pool_tempreserve).
+	 * Check for sufficient space.
+	 *
+	 * When the sync task was created, the caller specified the
+	 * type of space checking required.  See the comment in
+	 * zfs_space_check_t for details on the semantics of each
+	 * type of space checking.
+	 *
+	 * We just check against what's on-disk; we don't want any
+	 * in-flight accounting to get in our way, because open context
+	 * may have already used up various in-core limits
+	 * (arc_tempreserve, dsl_pool_tempreserve).
 	 */
-	quota = dsl_pool_adjustedsize(dp, B_FALSE) -
-	    metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
-	used = dp->dp_root_dir->dd_phys->dd_used_bytes;
-	/* MOS space is triple-dittoed, so we multiply by 3. */
-	if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) {
-		dst->dst_error = SET_ERROR(ENOSPC);
-		if (dst->dst_nowaiter)
-			kmem_free(dst, sizeof (*dst));
-		return;
+	if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) {
+		uint64_t quota = dsl_pool_adjustedsize(dp,
+		    dst->dst_space_check == ZFS_SPACE_CHECK_RESERVED) -
+		    metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+		uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
+		/* MOS space is triple-dittoed, so we multiply by 3. */
+		if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) {
+			dst->dst_error = SET_ERROR(ENOSPC);
+			if (dst->dst_nowaiter)
+				kmem_free(dst, sizeof (*dst));
+			return;
+		}
 	}
 
 	/*

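Since every dsl_sync_task() call site now names a space-check policy, a caller looks roughly like the sketch below. my_check, my_sync, and my_arg_t are hypothetical; only the signature and the ZFS_SPACE_CHECK_* values come from the change above.

	typedef struct { uint64_t obj; } my_arg_t;	/* hypothetical */
	static int my_check(void *arg, dmu_tx_t *tx);	/* hypothetical */
	static void my_sync(void *arg, dmu_tx_t *tx);	/* hypothetical */

	static int
	my_pool_op(const char *pool)
	{
		my_arg_t arg = { 0 };

		/*
		 * ZFS_SPACE_CHECK_RESERVED widens the quota computed by
		 * dsl_pool_adjustedsize(); ZFS_SPACE_CHECK_NONE skips the
		 * ENOSPC check entirely (as the scrub-setup and
		 * user-release call sites do).
		 */
		return (dsl_sync_task(pool, my_check, my_sync, &arg,
		    1, ZFS_SPACE_CHECK_RESERVED));
	}
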
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  */
 
@@ -65,10 +65,10 @@
 		return (SET_ERROR(E2BIG));
 
 	/* tags must be unique (if ds already exists) */
-	if (ds != NULL && ds->ds_phys->ds_userrefs_obj != 0) {
+	if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
 		uint64_t value;
 
-		error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj,
+		error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
 		    htag, 8, 1, &value);
 		if (error == 0)
 			error = SET_ERROR(EEXIST);
@@ -141,16 +141,16 @@
 
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
-	if (ds->ds_phys->ds_userrefs_obj == 0) {
+	if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) {
 		/*
 		 * This is the first user hold for this dataset.  Create
 		 * the userrefs zap object.
 		 */
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		zapobj = ds->ds_phys->ds_userrefs_obj =
+		zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj =
 		    zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
 	} else {
-		zapobj = ds->ds_phys->ds_userrefs_obj;
+		zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
 	}
 	ds->ds_userrefs++;
 
@@ -181,7 +181,7 @@
 }
 
 typedef struct zfs_hold_cleanup_arg {
-	char zhca_spaname[MAXNAMELEN];
+	char zhca_spaname[ZFS_MAX_DATASET_NAME_LEN];
 	uint64_t zhca_spa_load_guid;
 	nvlist_t *zhca_holds;
 } zfs_hold_cleanup_arg_t;
@@ -318,7 +318,8 @@
 	dduha.dduha_minor = cleanup_minor;
 
 	ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check,
-	    dsl_dataset_user_hold_sync, &dduha, fnvlist_num_pairs(holds));
+	    dsl_dataset_user_hold_sync, &dduha,
+	    fnvlist_num_pairs(holds), ZFS_SPACE_CHECK_RESERVED);
 	fnvlist_free(dduha.dduha_chkholds);
 
 	return (ret);
@@ -352,7 +353,7 @@
 	objset_t *mos;
 	int numholds;
 
-	if (!dsl_dataset_is_snapshot(ds))
+	if (!ds->ds_is_snapshot)
 		return (SET_ERROR(EINVAL));
 
 	if (nvlist_empty(holds))
@@ -360,7 +361,7 @@
 
 	numholds = 0;
 	mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	zapobj = ds->ds_phys->ds_userrefs_obj;
+	zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
 	holds_found = fnvlist_alloc();
 
 	for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
@@ -398,7 +399,8 @@
 		numholds++;
 	}
 
-	if (DS_IS_DEFER_DESTROY(ds) && ds->ds_phys->ds_num_children == 1 &&
+	if (DS_IS_DEFER_DESTROY(ds) &&
+	    dsl_dataset_phys(ds)->ds_num_children == 1 &&
 	    ds->ds_userrefs == numholds) {
 		/* we need to destroy the snapshot as well */
 		if (dsl_dataset_long_held(ds)) {
@@ -484,8 +486,8 @@
 		error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx);
 		VERIFY(error == 0 || error == ENOENT);
 
-		VERIFY0(zap_remove(mos, ds->ds_phys->ds_userrefs_obj, holdname,
-		    tx));
+		VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+		    holdname, tx));
 		ds->ds_userrefs--;
 
 		spa_history_log_internal_ds(ds, "release", tx,
@@ -514,7 +516,7 @@
 		    fnvpair_value_nvlist(pair), tx);
 		if (nvlist_exists(ddura->ddura_todelete, name)) {
 			ASSERT(ds->ds_userrefs == 0 &&
-			    ds->ds_phys->ds_num_children == 1 &&
+			    dsl_dataset_phys(ds)->ds_num_children == 1 &&
 			    DS_IS_DEFER_DESTROY(ds));
 			dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx);
 		}
@@ -565,21 +567,23 @@
 		ddura.ddura_holdfunc = dsl_dataset_hold_obj_string;
 		pool = spa_name(tmpdp->dp_spa);
 #ifdef _KERNEL
-		dsl_pool_config_enter(tmpdp, FTAG);
 		for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(holds, pair)) {
 			dsl_dataset_t *ds;
 
+			dsl_pool_config_enter(tmpdp, FTAG);
 			error = dsl_dataset_hold_obj_string(tmpdp,
 			    nvpair_name(pair), FTAG, &ds);
 			if (error == 0) {
-				char name[MAXNAMELEN];
+				char name[ZFS_MAX_DATASET_NAME_LEN];
 				dsl_dataset_name(ds, name);
+				dsl_pool_config_exit(tmpdp, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				(void) zfs_unmount_snap(name);
+			} else {
+				dsl_pool_config_exit(tmpdp, FTAG);
 			}
 		}
-		dsl_pool_config_exit(tmpdp, FTAG);
 #endif
 	} else {
 		/* Non-temporary holds are specified by name. */
@@ -599,8 +603,7 @@
 	ddura.ddura_chkholds = fnvlist_alloc();
 
 	error = dsl_sync_task(pool, dsl_dataset_user_release_check,
-	    dsl_dataset_user_release_sync, &ddura,
-	    fnvlist_num_pairs(holds));
+	    dsl_dataset_user_release_sync, &ddura, 0, ZFS_SPACE_CHECK_NONE);
 	fnvlist_free(ddura.ddura_todelete);
 	fnvlist_free(ddura.ddura_chkholds);
 
@@ -643,13 +646,13 @@
 		return (err);
 	}
 
-	if (ds->ds_phys->ds_userrefs_obj != 0) {
+	if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
 		zap_attribute_t *za;
 		zap_cursor_t zc;
 
 		za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 		for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
-		    ds->ds_phys->ds_userrefs_obj);
+		    dsl_dataset_phys(ds)->ds_userrefs_obj);
 		    zap_cursor_retrieve(&zc, za) == 0;
 		    zap_cursor_advance(&zc)) {
 			fnvlist_add_uint64(nvl, za->za_name,

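One structural change worth noting in the user-release hunk: dp_config_rwlock is now taken and dropped per loop iteration, so it is never held across zfs_unmount_snap(). As a generic sketch of that lock-scoping pattern (only dsl_pool_config_enter/exit and ZFS_MAX_DATASET_NAME_LEN come from the patch; the lookup and unmount helpers are hypothetical):

	nvpair_t *pair;
	char name[ZFS_MAX_DATASET_NAME_LEN];
	int error;

	for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(holds, pair)) {
		dsl_pool_config_enter(dp, FTAG);
		error = lookup_snap_name(dp, nvpair_name(pair), name);
		dsl_pool_config_exit(dp, FTAG);	/* drop before unmounting */
		if (error == 0)
			(void) unmount_snap(name);
	}
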
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -45,6 +45,8 @@
 static int LZ4_compress64kCtx(void *ctx, const char *source, char *dest,
     int isize, int osize);
 
+static kmem_cache_t *lz4_ctx_cache;
+
 /*ARGSUSED*/
 size_t
 lz4_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
@@ -841,7 +843,7 @@
 real_LZ4_compress(const char *source, char *dest, int isize, int osize)
 {
 #if HEAPMODE
-	void *ctx = kmem_zalloc(sizeof (struct refTables), KM_NOSLEEP);
+	void *ctx = kmem_cache_alloc(lz4_ctx_cache, KM_NOSLEEP);
 	int result;
 
 	/*
@@ -851,12 +853,13 @@
 	if (ctx == NULL)
 		return (0);
 
+	bzero(ctx, sizeof (struct refTables));
 	if (isize < LZ4_64KLIMIT)
 		result = LZ4_compress64kCtx(ctx, source, dest, isize, osize);
 	else
 		result = LZ4_compressCtx(ctx, source, dest, isize, osize);
 
-	kmem_free(ctx, sizeof (struct refTables));
+	kmem_cache_free(lz4_ctx_cache, ctx);
 	return (result);
 #else
 	if (isize < (int)LZ4_64KLIMIT)
@@ -1002,3 +1005,22 @@
 	_output_error:
 	return (int)(-(((char *)ip) - source));
 }
+
+extern void
+lz4_init(void)
+{
+
+#if HEAPMODE
+	lz4_ctx_cache = kmem_cache_create("lz4_ctx", sizeof (struct refTables),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+#endif
+}
+
+extern void
+lz4_fini(void)
+{
+
+#if HEAPMODE
+	kmem_cache_destroy(lz4_ctx_cache);
+#endif
+}

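The lz4 change replaces a per-call kmem_zalloc() of the refTables scratch area with a long-lived kmem_cache. A minimal sketch of the same lifecycle, with hypothetical names; note that cache memory is not zeroed on allocation (hence the explicit bzero()) and that KM_NOSLEEP can still return NULL:

	struct my_ctx { uint64_t table[4096]; };	/* hypothetical */
	static kmem_cache_t *my_ctx_cache;

	void
	my_init(void)
	{
		my_ctx_cache = kmem_cache_create("my_ctx",
		    sizeof (struct my_ctx), 0,
		    NULL, NULL, NULL, NULL, NULL, 0);
	}

	int
	my_op(void)
	{
		struct my_ctx *ctx = kmem_cache_alloc(my_ctx_cache, KM_NOSLEEP);

		if (ctx == NULL)
			return (0);	/* caller falls back, as lz4 does */
		bzero(ctx, sizeof (*ctx));
		/* ... use ctx as scratch space ... */
		kmem_cache_free(my_ctx_cache, ctx);
		return (1);
	}

	void
	my_fini(void)
	{
		kmem_cache_destroy(my_ctx_cache);
	}
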
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -118,7 +118,9 @@
 			src += 2;
 			if ((cpy = dst - offset) < (uchar_t *)d_start)
 				return (-1);
-			while (--mlen >= 0 && dst < d_end)
+			if (mlen > (d_end - dst))
+				mlen = d_end - dst;
+			while (--mlen >= 0)
 				*dst++ = *cpy++;
 		} else {
 			*dst++ = *src++;

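The lzjb hunk hoists the output-bounds check out of the byte-copy loop: mlen is clamped once against the remaining output space instead of testing dst < d_end on every iteration, which keeps the copy safe while removing a per-byte comparison. In isolation the pattern is:

	if (mlen > (d_end - dst))
		mlen = d_end - dst;	/* clamp once, up front */
	while (--mlen >= 0)
		*dst++ = *cpy++;	/* overlapping, byte-wise copy */
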
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,8 +21,9 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
@@ -32,21 +33,26 @@
 #include <sys/metaslab_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/zfeature.h>
 
-/*
- * Allow allocations to switch to gang blocks quickly. We do this to
- * avoid having to load lots of space_maps in a given txg. There are,
- * however, some cases where we want to avoid "fast" ganging and instead
- * we want to do an exhaustive search of all metaslabs on this device.
- * Currently we don't allow any gang, zil, or dump device related allocations
- * to "fast" gang.
- */
-#define	CAN_FASTGANG(flags) \
-	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
-	METASLAB_GANG_AVOID)))
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
 
+#define	GANG_ALLOCATION(flags) \
+	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
+
+#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
+#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
+#define	METASLAB_ACTIVE_MASK		\
+	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+
 uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
+TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang);
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
+    &metaslab_gang_bang, 0,
+    "Force gang block allocation for blocks larger than or equal to this value");
 
 /*
  * The in-core space map representation is more compact than its on-disk form.
@@ -55,32 +61,105 @@
  * Values should be greater than or equal to 100.
  */
 int zfs_condense_pct = 200;
+TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
+    &zfs_condense_pct, 0,
+    "Condense on-disk spacemap when it is larger than this many percent"
+    " of its in-memory counterpart");
 
 /*
- * This value defines the number of allowed allocation failures per vdev.
- * If a device reaches this threshold in a given txg then we consider skipping
- * allocations on that device.
+ * Condensing a metaslab is not guaranteed to actually reduce the amount of
+ * space used on disk. In particular, a space map uses data in increments of
+ * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
+ * same number of blocks after condensing. Since the goal of condensing is to
+ * reduce the number of IOPs required to read the space map, we only want to
+ * condense when we can be sure we will reduce the number of blocks used by the
+ * space map. Unfortunately, we cannot precisely compute whether or not this is
+ * the case in metaslab_should_condense since we are holding ms_lock. Instead,
+ * we apply the following heuristic: do not condense a spacemap unless the
+ * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
+ * blocks.
  */
-int zfs_mg_alloc_failures = 0;
+int zfs_metaslab_condense_block_threshold = 4;
 
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_alloc_failures, CTLFLAG_RDTUN,
-    &zfs_mg_alloc_failures, 0,
-    "Number of allowed allocation failures per vdev");
-TUNABLE_INT("vfs.zfs.mg_alloc_failures", &zfs_mg_alloc_failures);
+/*
+ * The zfs_mg_noalloc_threshold defines which metaslab groups should
+ * be eligible for allocation. The value is defined as a percentage of
+ * free space. Metaslab groups that have more free space than
+ * zfs_mg_noalloc_threshold are always eligible for allocations. Once
+ * a metaslab group's free space is less than or equal to the
+ * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
+ * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
+ * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
+ * groups are allowed to accept allocations. Gang blocks are always
+ * eligible to allocate on any metaslab group. The default value of 0 means
+ * no metaslab group will be excluded based on this criterion.
+ */
+int zfs_mg_noalloc_threshold = 0;
+TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
+    &zfs_mg_noalloc_threshold, 0,
+    "Percentage of metaslab group size that should be free"
+    " to make it eligible for allocation");
 
 /*
- * Metaslab debugging: when set, keeps all space maps in core to verify frees.
+ * Metaslab groups are considered eligible for allocations if their
+ * fragmenation metric (measured as a percentage) is less than or equal to
+ * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
+ * then it will be skipped unless all metaslab groups within the metaslab
+ * class have also crossed this threshold.
  */
-static int metaslab_debug = 0;
+int zfs_mg_fragmentation_threshold = 85;
+TUNABLE_INT("vfs.zfs.mg_fragmentation_threshold", &zfs_mg_fragmentation_threshold);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
+    &zfs_mg_fragmentation_threshold, 0,
+    "Maximum fragmentation, as a percentage, at which a metaslab group "
+    "is still eligible for allocations, unless all metaslab groups "
+    "within the metaslab class have also crossed this threshold");
 
 /*
+ * Allow metaslabs to keep their active state as long as their fragmentation
+ * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
+ * active metaslab that exceeds this threshold will no longer keep its active
+ * status allowing better metaslabs to be selected.
+ */
+int zfs_metaslab_fragmentation_threshold = 70;
+TUNABLE_INT("vfs.zfs.metaslab.fragmentation_threshold",
+    &zfs_metaslab_fragmentation_threshold);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
+    &zfs_metaslab_fragmentation_threshold, 0,
+    "Maximum fragmentation percentage at which a metaslab keeps its active state");
+
+/*
+ * When set will load all metaslabs when pool is first opened.
+ */
+int metaslab_debug_load = 0;
+TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
+    &metaslab_debug_load, 0,
+    "Load all metaslabs when pool is first opened");
+
+/*
+ * When set will prevent metaslabs from being unloaded.
+ */
+int metaslab_debug_unload = 0;
+TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
+    &metaslab_debug_unload, 0,
+    "Prevent metaslabs from being unloaded");
+
+/*
  * Minimum size which forces the dynamic allocator to change
  * its allocation strategy.  Once the space map cannot satisfy
  * an allocation of this size then it switches to using more
  * aggressive strategy (i.e search by size rather than offset).
  */
-uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
+uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
+TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold",
+    &metaslab_df_alloc_threshold);
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
+    &metaslab_df_alloc_threshold, 0,
+    "Minimum size which forces the dynamic allocator to change its allocation strategy");
 
 /*
  * The minimum free space, in percent, which must be available
@@ -89,6 +168,10 @@
  * switch to using best-fit allocations.
  */
 int metaslab_df_free_pct = 4;
+TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
+    &metaslab_df_free_pct, 0,
+    "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion");
 
 /*
  * A metaslab is considered "free" if it contains a contiguous
@@ -95,33 +178,89 @@
  * segment which is greater than metaslab_min_alloc_size.
  */
 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
+TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size",
+    &metaslab_min_alloc_size);
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
+    &metaslab_min_alloc_size, 0,
+    "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size");
 
 /*
- * Max number of space_maps to prefetch.
+ * Percentage of all cpus that can be used by the metaslab taskq.
  */
-int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
+int metaslab_load_pct = 50;
+TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
+    &metaslab_load_pct, 0,
+    "Percentage of cpus that can be used by the metaslab taskq");
 
 /*
- * Percentage bonus multiplier for metaslabs that are in the bonus area.
+ * Determines how many txgs a metaslab may remain loaded without having any
+ * allocations from it. As long as a metaslab continues to be used we will
+ * keep it loaded.
  */
-int metaslab_smo_bonus_pct = 150;
+int metaslab_unload_delay = TXG_SIZE * 2;
+TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
+    &metaslab_unload_delay, 0,
+    "Number of TXGs that an unused metaslab can be kept in memory");
 
 /*
- * Should we be willing to write data to degraded vdevs?
+ * Max number of metaslabs per group to preload.
  */
-boolean_t zfs_write_to_degraded = B_FALSE;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, write_to_degraded, CTLFLAG_RW,
-    &zfs_write_to_degraded, 0,
-    "Allow writing data to degraded vdevs");
-TUNABLE_INT("vfs.zfs.write_to_degraded", &zfs_write_to_degraded);
+int metaslab_preload_limit = SPA_DVAS_PER_BP;
+TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
+    &metaslab_preload_limit, 0,
+    "Max number of metaslabs per group to preload");
 
 /*
+ * Enable/disable preloading of metaslab.
+ */
+boolean_t metaslab_preload_enabled = B_TRUE;
+TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
+    &metaslab_preload_enabled, 0,
+    "Enable metaslab preloading");
+
+/*
+ * Enable/disable fragmentation weighting on metaslabs.
+ */
+boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
+TUNABLE_INT("vfs.zfs.metaslab_fragmentation_factor_enabled",
+    &metaslab_fragmentation_factor_enabled);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
+    &metaslab_fragmentation_factor_enabled, 0,
+    "Enable fragmentation weighting on metaslabs");
+
+/*
+ * Enable/disable lba weighting (i.e. outer tracks are given preference).
+ */
+boolean_t metaslab_lba_weighting_enabled = B_TRUE;
+TUNABLE_INT("vfs.zfs.metaslab.lba_weighting_enabled",
+    &metaslab_lba_weighting_enabled);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
+    &metaslab_lba_weighting_enabled, 0,
+    "Enable LBA weighting (i.e. outer tracks are given preference)");
+
+/*
+ * Enable/disable metaslab group biasing.
+ */
+boolean_t metaslab_bias_enabled = B_TRUE;
+TUNABLE_INT("vfs.zfs.metaslab.bias_enabled",
+    &metaslab_bias_enabled);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
+    &metaslab_bias_enabled, 0,
+    "Enable metaslab group biasing");
+
+static uint64_t metaslab_fragmentation(metaslab_t *);
+
+/*
  * ==========================================================================
  * Metaslab classes
  * ==========================================================================
  */
 metaslab_class_t *
-metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
+metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
 {
 	metaslab_class_t *mc;
 
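Most of the knobs introduced above follow the same FreeBSD idiom: a TUNABLE_* entry so the value can be set from loader.conf at boot, paired with a SYSCTL_* entry flagged CTLFLAG_RWTUN so the same variable is also visible and writable at runtime. The shape of the pattern, shown with a hypothetical knob:

	int zfs_example_knob = 42;			/* hypothetical */
	TUNABLE_INT("vfs.zfs.example_knob", &zfs_example_knob);
	SYSCTL_INT(_vfs_zfs, OID_AUTO, example_knob, CTLFLAG_RWTUN,
	    &zfs_example_knob, 0,
	    "One-line description shown by sysctl -d");
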
@@ -130,6 +269,8 @@
 	mc->mc_spa = spa;
 	mc->mc_rotor = NULL;
 	mc->mc_ops = ops;
+	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
+	refcount_create_tracked(&mc->mc_alloc_slots);
 
 	return (mc);
 }
@@ -143,6 +284,8 @@
 	ASSERT(mc->mc_space == 0);
 	ASSERT(mc->mc_dspace == 0);
 
+	refcount_destroy(&mc->mc_alloc_slots);
+	mutex_destroy(&mc->mc_lock);
 	kmem_free(mc, sizeof (metaslab_class_t));
 }
 
@@ -182,6 +325,27 @@
 	atomic_add_64(&mc->mc_dspace, dspace_delta);
 }
 
+void
+metaslab_class_minblocksize_update(metaslab_class_t *mc)
+{
+	metaslab_group_t *mg;
+	vdev_t *vd;
+	uint64_t minashift = UINT64_MAX;
+
+	if ((mg = mc->mc_rotor) == NULL) {
+		mc->mc_minblocksize = SPA_MINBLOCKSIZE;
+		return;
+	}
+
+	do {
+		vd = mg->mg_vd;
+		if (vd->vdev_ashift < minashift)
+			minashift = vd->vdev_ashift;
+	} while ((mg = mg->mg_next) != mc->mc_rotor);
+
+	mc->mc_minblocksize = 1ULL << minashift;
+}
+
 uint64_t
 metaslab_class_get_alloc(metaslab_class_t *mc)
 {
@@ -206,7 +370,134 @@
 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
 }
 
+uint64_t
+metaslab_class_get_minblocksize(metaslab_class_t *mc)
+{
+	return (mc->mc_minblocksize);
+}
+
+void
+metaslab_class_histogram_verify(metaslab_class_t *mc)
+{
+	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+	uint64_t *mc_hist;
+	int i;
+
+	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
+		return;
+
+	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
+	    KM_SLEEP);
+
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		metaslab_group_t *mg = tvd->vdev_mg;
+
+		/*
+		 * Skip any holes, uninitialized top-levels, or
+		 * vdevs that are not in this metaslab class.
+		 */
+		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
+		    mg->mg_class != mc) {
+			continue;
+		}
+
+		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+			mc_hist[i] += mg->mg_histogram[i];
+	}
+
+	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
+
+	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
+}
+
 /*
+ * Calculate the metaslab class's fragmentation metric. The metric
+ * is weighted based on the space contribution of each metaslab group.
+ * The return value will be a number between 0 and 100 (inclusive), or
+ * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
+ * zfs_frag_table for more information about the metric.
+ */
+uint64_t
+metaslab_class_fragmentation(metaslab_class_t *mc)
+{
+	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+	uint64_t fragmentation = 0;
+
+	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
+
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		metaslab_group_t *mg = tvd->vdev_mg;
+
+		/*
+		 * Skip any holes, uninitialized top-levels, or
+		 * vdevs that are not in this metaslab class.
+		 */
+		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
+		    mg->mg_class != mc) {
+			continue;
+		}
+
+		/*
+		 * If a metaslab group does not contain a fragmentation
+		 * metric then just bail out.
+		 */
+		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
+			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+			return (ZFS_FRAG_INVALID);
+		}
+
+		/*
+		 * Determine how much this metaslab_group is contributing
+		 * to the overall pool fragmentation metric.
+		 */
+		fragmentation += mg->mg_fragmentation *
+		    metaslab_group_get_space(mg);
+	}
+	fragmentation /= metaslab_class_get_space(mc);
+
+	ASSERT3U(fragmentation, <=, 100);
+	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+	return (fragmentation);
+}
+
+/*
+ * Calculate the amount of expandable space that is available in
+ * this metaslab class. If a device is expanded then its expandable
+ * space will be the amount of allocatable space that is currently not
+ * part of this metaslab class.
+ */
+uint64_t
+metaslab_class_expandable_space(metaslab_class_t *mc)
+{
+	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+	uint64_t space = 0;
+
+	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		metaslab_group_t *mg = tvd->vdev_mg;
+
+		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
+		    mg->mg_class != mc) {
+			continue;
+		}
+
+		/*
+		 * Calculate if we have enough space to add additional
+		 * metaslabs. We report the expandable space in terms
+		 * of the metaslab size since that's the unit of expansion.
+		 */
+		space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
+		    1ULL << tvd->vdev_ms_shift);
+	}
+	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+	return (space);
+}
+
+/*
  * ==========================================================================
  * Metaslab groups
  * ==========================================================================
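A worked example of the space-weighted average in metaslab_class_fragmentation() above (numbers invented): if one metaslab group contributes 750GB of a 1000GB class with mg_fragmentation = 20 and another contributes 250GB with mg_fragmentation = 60, the loop accumulates 20*750G + 60*250G, and dividing by metaslab_class_get_space() gives (15000G + 15000G) / 1000G = 30, i.e. a class-wide fragmentation of 30 percent.
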
@@ -225,9 +516,9 @@
 	/*
 	 * If the weights are identical, use the offset to force uniqueness.
 	 */
-	if (m1->ms_map->sm_start < m2->ms_map->sm_start)
+	if (m1->ms_start < m2->ms_start)
 		return (-1);
-	if (m1->ms_map->sm_start > m2->ms_map->sm_start)
+	if (m1->ms_start > m2->ms_start)
 		return (1);
 
 	ASSERT3P(m1, ==, m2);
@@ -235,6 +526,87 @@
 	return (0);
 }
 
+/*
+ * Update the allocatable flag and the metaslab group's capacity.
+ * The allocatable flag is set to true if the capacity is below
+ * the zfs_mg_noalloc_threshold or has a fragmentation value that is
+ * greater than zfs_mg_fragmentation_threshold. If a metaslab group
+ * transitions from allocatable to non-allocatable or vice versa then the
+ * metaslab group's class is updated to reflect the transition.
+ */
+static void
+metaslab_group_alloc_update(metaslab_group_t *mg)
+{
+	vdev_t *vd = mg->mg_vd;
+	metaslab_class_t *mc = mg->mg_class;
+	vdev_stat_t *vs = &vd->vdev_stat;
+	boolean_t was_allocatable;
+	boolean_t was_initialized;
+
+	ASSERT(vd == vd->vdev_top);
+
+	mutex_enter(&mg->mg_lock);
+	was_allocatable = mg->mg_allocatable;
+	was_initialized = mg->mg_initialized;
+
+	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
+	    (vs->vs_space + 1);
+
+	mutex_enter(&mc->mc_lock);
+
+	/*
+	 * If the metaslab group was just added then it won't
+	 * have any space until we finish syncing out this txg.
+	 * At that point we will consider it initialized and available
+	 * for allocations.  We also don't consider non-activated
+	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
+	 * to be initialized, because they can't be used for allocation.
+	 */
+	mg->mg_initialized = metaslab_group_initialized(mg);
+	if (!was_initialized && mg->mg_initialized) {
+		mc->mc_groups++;
+	} else if (was_initialized && !mg->mg_initialized) {
+		ASSERT3U(mc->mc_groups, >, 0);
+		mc->mc_groups--;
+	}
+	if (mg->mg_initialized)
+		mg->mg_no_free_space = B_FALSE;
+
+	/*
+	 * A metaslab group is considered allocatable if it has plenty
+	 * of free space or is not heavily fragmented. We only take
+	 * fragmentation into account if the metaslab group has a valid
+	 * fragmentation metric (i.e. a value between 0 and 100).
+	 */
+	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
+	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
+	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
+	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
+
+	/*
+	 * The mc_alloc_groups maintains a count of the number of
+	 * groups in this metaslab class that are still above the
+	 * zfs_mg_noalloc_threshold. This is used by the allocating
+	 * threads to determine if they should avoid allocations to
+	 * a given group. The allocator will avoid allocations to a group
+	 * if that group has reached or is below the zfs_mg_noalloc_threshold
+	 * and there are still other groups that are above the threshold.
+	 * When a group transitions from allocatable to non-allocatable or
+	 * vice versa we update the metaslab class to reflect that change.
+	 * When the mc_alloc_groups value drops to 0 that means that all
+	 * groups have reached the zfs_mg_noalloc_threshold making all groups
+	 * eligible for allocations. This effectively means that all devices
+	 * are balanced again.
+	 */
+	if (was_allocatable && !mg->mg_allocatable)
+		mc->mc_alloc_groups--;
+	else if (!was_allocatable && mg->mg_allocatable)
+		mc->mc_alloc_groups++;
+	mutex_exit(&mc->mc_lock);
+
+	mutex_exit(&mg->mg_lock);
+}
+
 metaslab_group_t *
 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 {
@@ -247,7 +619,13 @@
 	mg->mg_vd = vd;
 	mg->mg_class = mc;
 	mg->mg_activation_count = 0;
+	mg->mg_initialized = B_FALSE;
+	mg->mg_no_free_space = B_TRUE;
+	refcount_create_tracked(&mg->mg_alloc_queue_depth);
 
+	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
+	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
+
 	return (mg);
 }
 
@@ -263,8 +641,10 @@
 	 */
 	ASSERT(mg->mg_activation_count <= 0);
 
+	taskq_destroy(mg->mg_taskq);
 	avl_destroy(&mg->mg_metaslab_tree);
 	mutex_destroy(&mg->mg_lock);
+	refcount_destroy(&mg->mg_alloc_queue_depth);
 	kmem_free(mg, sizeof (metaslab_group_t));
 }
 
@@ -285,6 +665,7 @@
 		return;
 
 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+	metaslab_group_alloc_update(mg);
 
 	if ((mgprev = mc->mc_rotor) == NULL) {
 		mg->mg_prev = mg;
@@ -297,6 +678,7 @@
 		mgnext->mg_prev = mg;
 	}
 	mc->mc_rotor = mg;
+	metaslab_class_minblocksize_update(mc);
 }
 
 void
@@ -315,6 +697,9 @@
 		return;
 	}
 
+	taskq_wait(mg->mg_taskq);
+	metaslab_group_alloc_update(mg);
+
 	mgprev = mg->mg_prev;
 	mgnext = mg->mg_next;
 
@@ -328,22 +713,125 @@
 
 	mg->mg_prev = NULL;
 	mg->mg_next = NULL;
+	metaslab_class_minblocksize_update(mc);
 }
 
+boolean_t
+metaslab_group_initialized(metaslab_group_t *mg)
+{
+	vdev_t *vd = mg->mg_vd;
+	vdev_stat_t *vs = &vd->vdev_stat;
+
+	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
+}
+
+uint64_t
+metaslab_group_get_space(metaslab_group_t *mg)
+{
+	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
+}
+
+void
+metaslab_group_histogram_verify(metaslab_group_t *mg)
+{
+	uint64_t *mg_hist;
+	vdev_t *vd = mg->mg_vd;
+	uint64_t ashift = vd->vdev_ashift;
+	int i;
+
+	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
+		return;
+
+	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
+	    KM_SLEEP);
+
+	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
+	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
+
+	for (int m = 0; m < vd->vdev_ms_count; m++) {
+		metaslab_t *msp = vd->vdev_ms[m];
+
+		if (msp->ms_sm == NULL)
+			continue;
+
+		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
+			mg_hist[i + ashift] +=
+			    msp->ms_sm->sm_phys->smp_histogram[i];
+	}
+
+	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
+		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
+
+	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
+}
+
 static void
+metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
+{
+	metaslab_class_t *mc = mg->mg_class;
+	uint64_t ashift = mg->mg_vd->vdev_ashift;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	if (msp->ms_sm == NULL)
+		return;
+
+	mutex_enter(&mg->mg_lock);
+	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+		mg->mg_histogram[i + ashift] +=
+		    msp->ms_sm->sm_phys->smp_histogram[i];
+		mc->mc_histogram[i + ashift] +=
+		    msp->ms_sm->sm_phys->smp_histogram[i];
+	}
+	mutex_exit(&mg->mg_lock);
+}
+
+void
+metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
+{
+	metaslab_class_t *mc = mg->mg_class;
+	uint64_t ashift = mg->mg_vd->vdev_ashift;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	if (msp->ms_sm == NULL)
+		return;
+
+	mutex_enter(&mg->mg_lock);
+	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+		ASSERT3U(mg->mg_histogram[i + ashift], >=,
+		    msp->ms_sm->sm_phys->smp_histogram[i]);
+		ASSERT3U(mc->mc_histogram[i + ashift], >=,
+		    msp->ms_sm->sm_phys->smp_histogram[i]);
+
+		mg->mg_histogram[i + ashift] -=
+		    msp->ms_sm->sm_phys->smp_histogram[i];
+		mc->mc_histogram[i + ashift] -=
+		    msp->ms_sm->sm_phys->smp_histogram[i];
+	}
+	mutex_exit(&mg->mg_lock);
+}
+
+static void
 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 {
+	ASSERT(msp->ms_group == NULL);
 	mutex_enter(&mg->mg_lock);
-	ASSERT(msp->ms_group == NULL);
 	msp->ms_group = mg;
 	msp->ms_weight = 0;
 	avl_add(&mg->mg_metaslab_tree, msp);
 	mutex_exit(&mg->mg_lock);
+
+	mutex_enter(&msp->ms_lock);
+	metaslab_group_histogram_add(mg, msp);
+	mutex_exit(&msp->ms_lock);
 }
 
 static void
 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 {
+	mutex_enter(&msp->ms_lock);
+	metaslab_group_histogram_remove(mg, msp);
+	mutex_exit(&msp->ms_lock);
+
 	mutex_enter(&mg->mg_lock);
 	ASSERT(msp->ms_group == mg);
 	avl_remove(&mg->mg_metaslab_tree, msp);
@@ -356,9 +844,9 @@
 {
 	/*
 	 * Although in principle the weight can be any value, in
-	 * practice we do not use values in the range [1, 510].
+	 * practice we do not use values in the range [1, 511].
 	 */
-	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
+	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	mutex_enter(&mg->mg_lock);
@@ -370,26 +858,162 @@
 }
 
 /*
+ * Calculate the fragmentation for a given metaslab group. We can use
+ * a simple average here since all metaslabs within the group must have
+ * the same size. The return value will be a value between 0 and 100
+ * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
+ * group have a fragmentation metric.
+ */
+uint64_t
+metaslab_group_fragmentation(metaslab_group_t *mg)
+{
+	vdev_t *vd = mg->mg_vd;
+	uint64_t fragmentation = 0;
+	uint64_t valid_ms = 0;
+
+	for (int m = 0; m < vd->vdev_ms_count; m++) {
+		metaslab_t *msp = vd->vdev_ms[m];
+
+		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
+			continue;
+
+		valid_ms++;
+		fragmentation += msp->ms_fragmentation;
+	}
+
+	if (valid_ms <= vd->vdev_ms_count / 2)
+		return (ZFS_FRAG_INVALID);
+
+	fragmentation /= valid_ms;
+	ASSERT3U(fragmentation, <=, 100);
+	return (fragmentation);
+}
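
To make the half-valid rule concrete, here is a toy run of the same average
(values invented; ZFS_FRAG_INVALID is assumed to be UINT64_MAX as in the ZFS
headers):

	#include <stdio.h>
	#include <stdint.h>

	#define ZFS_FRAG_INVALID	UINT64_MAX	/* assumed sentinel */

	int
	main(void)
	{
		/* Hypothetical per-metaslab fragmentation values. */
		uint64_t frag[] = { 10, 35, ZFS_FRAG_INVALID, 55 };
		int ms_count = 4;
		uint64_t sum = 0, valid = 0;

		for (int m = 0; m < ms_count; m++) {
			if (frag[m] == ZFS_FRAG_INVALID)
				continue;	/* skipped, as in the loop above */
			valid++;
			sum += frag[m];
		}

		/* Fewer than half valid -> the whole group reports invalid. */
		if (valid <= ms_count / 2)
			printf("group: ZFS_FRAG_INVALID\n");
		else
			printf("group: %llu%%\n", (unsigned long long)(sum / valid));
		return (0);
	}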
+
+/*
+ * Determine if a given metaslab group should skip allocations. A metaslab
+ * group should avoid allocations if its free capacity is less than the
+ * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
+ * zfs_mg_fragmentation_threshold and there is at least one metaslab group
+ * that can still handle allocations. If the allocation throttle is enabled
+ * then we skip allocations to devices that have reached their maximum
+ * allocation queue depth unless the selected metaslab group is the only
+ * eligible group remaining.
+ */
+static boolean_t
+metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
+    uint64_t psize)
+{
+	spa_t *spa = mg->mg_vd->vdev_spa;
+	metaslab_class_t *mc = mg->mg_class;
+
+	/*
+	 * We can only consider skipping this metaslab group if it's
+	 * in the normal metaslab class and there are other metaslab
+	 * groups to select from. Otherwise, we always consider it eligible
+	 * for allocations.
+	 */
+	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
+		return (B_TRUE);
+
+	/*
+	 * If the metaslab group's mg_allocatable flag is set (see comments
+	 * in metaslab_group_alloc_update() for more information) and
+	 * the allocation throttle is disabled then allow allocations to this
+	 * device. However, if the allocation throttle is enabled then
+	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
+	 * to determine if we should allow allocations to this metaslab group.
+	 * If all metaslab groups are no longer considered allocatable
+	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
+	 * gang block size then we allow allocations on this metaslab group
+	 * regardless of the mg_allocatable or throttle settings.
+	 */
+	if (mg->mg_allocatable) {
+		metaslab_group_t *mgp;
+		int64_t qdepth;
+		uint64_t qmax = mg->mg_max_alloc_queue_depth;
+
+		if (!mc->mc_alloc_throttle_enabled)
+			return (B_TRUE);
+
+		/*
+		 * If this metaslab group does not have any free space, then
+		 * there is no point in looking further.
+		 */
+		if (mg->mg_no_free_space)
+			return (B_FALSE);
+
+		qdepth = refcount_count(&mg->mg_alloc_queue_depth);
+
+		/*
+		 * If this metaslab group is below its qmax or it's
+		 * the only allocatable metaslab group, then attempt
+		 * to allocate from it.
+		 */
+		if (qdepth < qmax || mc->mc_alloc_groups == 1)
+			return (B_TRUE);
+		ASSERT3U(mc->mc_alloc_groups, >, 1);
+
+		/*
+		 * Since this metaslab group is at or over its qmax, we
+		 * need to determine if there are metaslab groups after this
+		 * one that might be able to handle this allocation. This is
+		 * racy since we can't hold the locks for all metaslab
+		 * groups at the same time when we make this check.
+		 */
+		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
+			qmax = mgp->mg_max_alloc_queue_depth;
+
+			qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
+
+			/*
+			 * If there is another metaslab group that
+			 * might be able to handle the allocation, then
+			 * we return false so that we skip this group.
+			 */
+			if (qdepth < qmax && !mgp->mg_no_free_space)
+				return (B_FALSE);
+		}
+
+		/*
+		 * We didn't find another group to handle the allocation
+		 * so we can't skip this metaslab group even though
+		 * we are at or over our qmax.
+		 */
+		return (B_TRUE);
+
+	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
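
The rotor walk above can be modeled in a few lines. In this toy version (all
structures and values invented), a group at or over its queue cap is skipped
only if some other group still has headroom; otherwise it keeps accepting
work:

	#include <stdio.h>
	#include <stdint.h>

	typedef struct toy_group {
		int64_t qdepth;		/* outstanding async allocations */
		int64_t qmax;		/* mg_max_alloc_queue_depth stand-in */
		int no_free_space;
	} toy_group_t;

	static int
	group_allocatable(toy_group_t *g, int n, int self)
	{
		if (g[self].no_free_space)
			return (0);
		if (g[self].qdepth < g[self].qmax)
			return (1);
		/* At or over cap: skip only if another group can absorb it. */
		for (int i = (self + 1) % n; i != self; i = (i + 1) % n)
			if (g[i].qdepth < g[i].qmax && !g[i].no_free_space)
				return (0);
		return (1);
	}

	int
	main(void)
	{
		toy_group_t g[3] = { { 12, 10, 0 }, { 3, 10, 0 }, { 10, 10, 0 } };

		/* Group 0 is over its cap and group 1 has headroom -> skip. */
		printf("group 0 allocatable: %d\n", group_allocatable(g, 3, 0));
		return (0);
	}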
+
+/*
  * ==========================================================================
- * Common allocator routines
+ * Range tree callbacks
  * ==========================================================================
  */
+
+/*
+ * Comparison function for the private size-ordered tree. Tree is sorted
+ * by size, larger sizes at the end of the tree.
+ */
 static int
-metaslab_segsize_compare(const void *x1, const void *x2)
+metaslab_rangesize_compare(const void *x1, const void *x2)
 {
-	const space_seg_t *s1 = x1;
-	const space_seg_t *s2 = x2;
-	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
-	uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+	const range_seg_t *r1 = x1;
+	const range_seg_t *r2 = x2;
+	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
+	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 
-	if (ss_size1 < ss_size2)
+	if (rs_size1 < rs_size2)
 		return (-1);
-	if (ss_size1 > ss_size2)
+	if (rs_size1 > rs_size2)
 		return (1);
 
-	if (s1->ss_start < s2->ss_start)
+	if (r1->rs_start < r2->rs_start)
 		return (-1);
-	if (s1->ss_start > s2->ss_start)
+
+	if (r1->rs_start > r2->rs_start)
 		return (1);
 
 	return (0);
@@ -396,137 +1020,195 @@
 }
 
 /*
- * This is a helper function that can be used by the allocator to find
- * a suitable block to allocate. This will search the specified AVL
- * tree looking for a block that matches the specified criteria.
+ * Create any block allocator specific components. The current allocators
+ * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
  */
-static uint64_t
-metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
-    uint64_t align)
+static void
+metaslab_rt_create(range_tree_t *rt, void *arg)
 {
-	space_seg_t *ss, ssearch;
-	avl_index_t where;
+	metaslab_t *msp = arg;
 
-	ssearch.ss_start = *cursor;
-	ssearch.ss_end = *cursor + size;
+	ASSERT3P(rt->rt_arg, ==, msp);
+	ASSERT(msp->ms_tree == NULL);
 
-	ss = avl_find(t, &ssearch, &where);
-	if (ss == NULL)
-		ss = avl_nearest(t, where, AVL_AFTER);
-
-	while (ss != NULL) {
-		uint64_t offset = P2ROUNDUP(ss->ss_start, align);
-
-		if (offset + size <= ss->ss_end) {
-			*cursor = offset + size;
-			return (offset);
-		}
-		ss = AVL_NEXT(t, ss);
-	}
-
-	/*
-	 * If we know we've searched the whole map (*cursor == 0), give up.
-	 * Otherwise, reset the cursor to the beginning and try again.
-	 */
-	if (*cursor == 0)
-		return (-1ULL);
-
-	*cursor = 0;
-	return (metaslab_block_picker(t, cursor, size, align));
+	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
+	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
 }
 
+/*
+ * Destroy the block allocator specific components.
+ */
 static void
-metaslab_pp_load(space_map_t *sm)
+metaslab_rt_destroy(range_tree_t *rt, void *arg)
 {
-	space_seg_t *ss;
+	metaslab_t *msp = arg;
 
-	ASSERT(sm->sm_ppd == NULL);
-	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+	ASSERT3P(rt->rt_arg, ==, msp);
+	ASSERT3P(msp->ms_tree, ==, rt);
+	ASSERT0(avl_numnodes(&msp->ms_size_tree));
 
-	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
-	avl_create(sm->sm_pp_root, metaslab_segsize_compare,
-	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
-
-	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
-		avl_add(sm->sm_pp_root, ss);
+	avl_destroy(&msp->ms_size_tree);
 }
 
 static void
-metaslab_pp_unload(space_map_t *sm)
+metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
-	void *cookie = NULL;
+	metaslab_t *msp = arg;
 
-	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
-	sm->sm_ppd = NULL;
-
-	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
-		/* tear down the tree */
-	}
-
-	avl_destroy(sm->sm_pp_root);
-	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
-	sm->sm_pp_root = NULL;
+	ASSERT3P(rt->rt_arg, ==, msp);
+	ASSERT3P(msp->ms_tree, ==, rt);
+	VERIFY(!msp->ms_condensing);
+	avl_add(&msp->ms_size_tree, rs);
 }
 
-/* ARGSUSED */
 static void
-metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
-	/* No need to update cursor */
+	metaslab_t *msp = arg;
+
+	ASSERT3P(rt->rt_arg, ==, msp);
+	ASSERT3P(msp->ms_tree, ==, rt);
+	VERIFY(!msp->ms_condensing);
+	avl_remove(&msp->ms_size_tree, rs);
 }
 
-/* ARGSUSED */
 static void
-metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_rt_vacate(range_tree_t *rt, void *arg)
 {
-	/* No need to update cursor */
+	metaslab_t *msp = arg;
+
+	ASSERT3P(rt->rt_arg, ==, msp);
+	ASSERT3P(msp->ms_tree, ==, rt);
+
+	/*
+	 * Normally one would walk the tree freeing nodes along the way.
+	 * Since the nodes are shared with the range trees we can avoid
+	 * walking all nodes and just reinitialize the avl tree. The nodes
+	 * will be freed by the range tree, so we don't want to free them here.
+	 */
+	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
+	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
 }
 
+static range_tree_ops_t metaslab_rt_ops = {
+	metaslab_rt_create,
+	metaslab_rt_destroy,
+	metaslab_rt_add,
+	metaslab_rt_remove,
+	metaslab_rt_vacate
+};
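
The ops table above is a plain C vtable: the generic range tree invokes these
hooks on every insert and remove so the allocator's private size-ordered index
never drifts out of sync with the offset-ordered tree. A stripped-down model
of the pattern (names invented):

	#include <stdio.h>

	typedef struct seg { int start, end; } seg_t;

	typedef struct tree_ops {
		void (*add)(seg_t *);
		void (*remove)(seg_t *);
	} tree_ops_t;

	static void
	size_index_add(seg_t *s)
	{
		printf("index: add [%d,%d)\n", s->start, s->end);
	}

	static void
	size_index_remove(seg_t *s)
	{
		printf("index: remove [%d,%d)\n", s->start, s->end);
	}

	static tree_ops_t ops = { size_index_add, size_index_remove };

	int
	main(void)
	{
		seg_t s = { 0, 4096 };

		/* A real range tree would call these from its own add/remove. */
		ops.add(&s);
		ops.remove(&s);
		return (0);
	}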
+
 /*
+ * ==========================================================================
+ * Metaslab block operations
+ * ==========================================================================
+ */
+
+/*
  * Return the maximum contiguous segment within the metaslab.
  */
 uint64_t
-metaslab_pp_maxsize(space_map_t *sm)
+metaslab_block_maxsize(metaslab_t *msp)
 {
-	avl_tree_t *t = sm->sm_pp_root;
-	space_seg_t *ss;
+	avl_tree_t *t = &msp->ms_size_tree;
+	range_seg_t *rs;
 
-	if (t == NULL || (ss = avl_last(t)) == NULL)
+	if (t == NULL || (rs = avl_last(t)) == NULL)
 		return (0ULL);
 
-	return (ss->ss_end - ss->ss_start);
+	return (rs->rs_end - rs->rs_start);
 }
 
+uint64_t
+metaslab_block_alloc(metaslab_t *msp, uint64_t size)
+{
+	uint64_t start;
+	range_tree_t *rt = msp->ms_tree;
+
+	VERIFY(!msp->ms_condensing);
+
+	start = msp->ms_ops->msop_alloc(msp, size);
+	if (start != -1ULL) {
+		vdev_t *vd = msp->ms_group->mg_vd;
+
+		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
+		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
+		range_tree_remove(rt, start, size);
+	}
+	return (start);
+}
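
The two VERIFY0(P2PHASE(...)) checks assert that both the returned offset and
the size are multiples of the vdev's sector size. For reference, the
power-of-2 helpers behave as follows (shown in their simplified sysmacros.h
forms, with a hypothetical 4K-sector vdev):

	#include <stdio.h>
	#include <stdint.h>

	/* Simplified forms of the sysmacros.h helpers used above. */
	#define P2PHASE(x, align)	((x) & ((align) - 1))
	#define P2ROUNDUP(x, align)	(-(-(x) & -(align)))

	int
	main(void)
	{
		uint64_t ashift = 12;		/* hypothetical 4K-sector vdev */
		uint64_t sector = 1ULL << ashift;

		/* Aligned offset -> phase 0, which is what VERIFY0 demands. */
		printf("P2PHASE(0x5000, 4K)   = 0x%llx\n",
		    (unsigned long long)P2PHASE(0x5000ULL, sector));
		printf("P2ROUNDUP(0x4001, 4K) = 0x%llx\n",
		    (unsigned long long)P2ROUNDUP(0x4001ULL, sector));
		return (0);
	}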
+
 /*
  * ==========================================================================
- * The first-fit block allocator
+ * Common allocator routines
  * ==========================================================================
  */
+
+/*
+ * This is a helper function that can be used by the allocator to find
+ * a suitable block to allocate. This will search the specified AVL
+ * tree looking for a block that matches the specified criteria.
+ */
 static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+    uint64_t align)
 {
-	avl_tree_t *t = &sm->sm_root;
-	uint64_t align = size & -size;
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+	range_seg_t *rs, rsearch;
+	avl_index_t where;
 
+	rsearch.rs_start = *cursor;
+	rsearch.rs_end = *cursor + size;
+
+	rs = avl_find(t, &rsearch, &where);
+	if (rs == NULL)
+		rs = avl_nearest(t, where, AVL_AFTER);
+
+	while (rs != NULL) {
+		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
+
+		if (offset + size <= rs->rs_end) {
+			*cursor = offset + size;
+			return (offset);
+		}
+		rs = AVL_NEXT(t, rs);
+	}
+
+	/*
+	 * If we know we've searched the whole map (*cursor == 0), give up.
+	 * Otherwise, reset the cursor to the beginning and try again.
+	 */
+	if (*cursor == 0)
+		return (-1ULL);
+
+	*cursor = 0;
 	return (metaslab_block_picker(t, cursor, size, align));
 }
 
-/* ARGSUSED */
-boolean_t
-metaslab_ff_fragmented(space_map_t *sm)
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
 {
-	return (B_TRUE);
+	/*
+	 * Find the largest power of 2 block size that evenly divides the
+	 * requested size. This is used to try to allocate blocks with similar
+	 * alignment from the same area of the metaslab (i.e. same cursor
+	 * bucket), but this is not guaranteed, since allocations of other
+	 * sizes may exist in the same region.
+	 */
+	uint64_t align = size & -size;
+	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
+	avl_tree_t *t = &msp->ms_tree->rt_root;
+
+	return (metaslab_block_picker(t, cursor, size, align));
 }
 
-static space_map_ops_t metaslab_ff_ops = {
-	metaslab_pp_load,
-	metaslab_pp_unload,
-	metaslab_ff_alloc,
-	metaslab_pp_claim,
-	metaslab_pp_free,
-	metaslab_pp_maxsize,
-	metaslab_ff_fragmented
+static metaslab_ops_t metaslab_ff_ops = {
+	metaslab_ff_alloc
 };
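
The `size & -size` idiom in metaslab_ff_alloc() isolates the lowest set bit of
the request, i.e. the largest power of two that evenly divides it, and that
value picks the cursor bucket. A standalone check (highbit64() approximated
with a GCC/Clang builtin; sizes invented):

	#include <stdio.h>
	#include <stdint.h>

	/* 1-based index of the highest set bit, mimicking the kernel helper. */
	static int
	highbit64(uint64_t x)
	{
		return (x == 0 ? 0 : 64 - __builtin_clzll(x));
	}

	int
	main(void)
	{
		uint64_t sizes[] = { 512, 4096, 12288, 24576 };

		for (int i = 0; i < 4; i++) {
			uint64_t align = sizes[i] & -sizes[i];
			printf("size %6llu -> align %5llu -> cursor bucket %d\n",
			    (unsigned long long)sizes[i],
			    (unsigned long long)align, highbit64(align) - 1);
		}
		return (0);
	}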
 
 /*
@@ -538,16 +1220,24 @@
  * ==========================================================================
  */
 static uint64_t
-metaslab_df_alloc(space_map_t *sm, uint64_t size)
+metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 {
-	avl_tree_t *t = &sm->sm_root;
+	/*
+	 * Find the largest power of 2 block size that evenly divides the
+	 * requested size. This is used to try to allocate blocks with similar
+	 * alignment from the same area of the metaslab (i.e. same cursor
+	 * bucket), but this is not guaranteed, since allocations of other
+	 * sizes may exist in the same region.
+	 */
 	uint64_t align = size & -size;
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
-	uint64_t max_size = metaslab_pp_maxsize(sm);
-	int free_pct = sm->sm_space * 100 / sm->sm_size;
+	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
+	range_tree_t *rt = msp->ms_tree;
+	avl_tree_t *t = &rt->rt_root;
+	uint64_t max_size = metaslab_block_maxsize(msp);
+	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 
 	if (max_size < size)
 		return (-1ULL);
@@ -558,7 +1248,7 @@
 	 */
 	if (max_size < metaslab_df_alloc_threshold ||
 	    free_pct < metaslab_df_free_pct) {
-		t = sm->sm_pp_root;
+		t = &msp->ms_size_tree;
 		*cursor = 0;
 	}
 
@@ -565,158 +1255,113 @@
 	return (metaslab_block_picker(t, cursor, size, 1ULL));
 }
 
-static boolean_t
-metaslab_df_fragmented(space_map_t *sm)
-{
-	uint64_t max_size = metaslab_pp_maxsize(sm);
-	int free_pct = sm->sm_space * 100 / sm->sm_size;
-
-	if (max_size >= metaslab_df_alloc_threshold &&
-	    free_pct >= metaslab_df_free_pct)
-		return (B_FALSE);
-
-	return (B_TRUE);
-}
-
-static space_map_ops_t metaslab_df_ops = {
-	metaslab_pp_load,
-	metaslab_pp_unload,
-	metaslab_df_alloc,
-	metaslab_pp_claim,
-	metaslab_pp_free,
-	metaslab_pp_maxsize,
-	metaslab_df_fragmented
+static metaslab_ops_t metaslab_df_ops = {
+	metaslab_df_alloc
 };
 
 /*
  * ==========================================================================
- * Other experimental allocators
+ * Cursor fit block allocator -
+ * Select the largest region in the metaslab, set the cursor to the beginning
+ * of the range and the cursor_end to the end of the range. As allocations
+ * are made advance the cursor. Continue allocating from the cursor until
+ * the range is exhausted and then find a new range.
  * ==========================================================================
  */
 static uint64_t
-metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
+metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 {
-	avl_tree_t *t = &sm->sm_root;
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd;
-	uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
-	uint64_t max_size = metaslab_pp_maxsize(sm);
-	uint64_t rsize = size;
+	range_tree_t *rt = msp->ms_tree;
+	avl_tree_t *t = &msp->ms_size_tree;
+	uint64_t *cursor = &msp->ms_lbas[0];
+	uint64_t *cursor_end = &msp->ms_lbas[1];
 	uint64_t offset = 0;
 
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
 
-	if (max_size < size)
-		return (-1ULL);
+	ASSERT3U(*cursor_end, >=, *cursor);
 
-	ASSERT3U(*extent_end, >=, *cursor);
+	if ((*cursor + size) > *cursor_end) {
+		range_seg_t *rs;
 
-	/*
-	 * If we're running low on space switch to using the size
-	 * sorted AVL tree (best-fit).
-	 */
-	if ((*cursor + size) > *extent_end) {
+		rs = avl_last(&msp->ms_size_tree);
+		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
+			return (-1ULL);
 
-		t = sm->sm_pp_root;
-		*cursor = *extent_end = 0;
-
-		if (max_size > 2 * SPA_MAXBLOCKSIZE)
-			rsize = MIN(metaslab_min_alloc_size, max_size);
-		offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
-		if (offset != -1)
-			*cursor = offset + size;
-	} else {
-		offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
+		*cursor = rs->rs_start;
+		*cursor_end = rs->rs_end;
 	}
-	ASSERT3U(*cursor, <=, *extent_end);
-	return (offset);
-}
 
-static boolean_t
-metaslab_cdf_fragmented(space_map_t *sm)
-{
-	uint64_t max_size = metaslab_pp_maxsize(sm);
+	offset = *cursor;
+	*cursor += size;
 
-	if (max_size > (metaslab_min_alloc_size * 10))
-		return (B_FALSE);
-	return (B_TRUE);
+	return (offset);
 }
 
-static space_map_ops_t metaslab_cdf_ops = {
-	metaslab_pp_load,
-	metaslab_pp_unload,
-	metaslab_cdf_alloc,
-	metaslab_pp_claim,
-	metaslab_pp_free,
-	metaslab_pp_maxsize,
-	metaslab_cdf_fragmented
+static metaslab_ops_t metaslab_cf_ops = {
+	metaslab_cf_alloc
 };
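
The cursor-fit policy reduces to "carve sequentially from one big region,
refill from the size tree when it runs dry". A toy model with a single
hard-coded free region standing in for avl_last(&ms_size_tree) (all numbers
invented):

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t cursor = 0, cursor_end = 0;
		/* One free region, standing in for the size tree lookup. */
		uint64_t rs_start = 1 << 20, rs_end = (1 << 20) + 65536;

		for (int i = 0; i < 5; i++) {
			uint64_t size = 16384;

			if (cursor + size > cursor_end) {	/* region exhausted */
				if (rs_end - rs_start < size) {
					printf("alloc %d: -1 (no region)\n", i);
					continue;
				}
				cursor = rs_start;
				cursor_end = rs_end;
				rs_start = rs_end;	/* toy: region consumed */
			}
			printf("alloc %d: offset 0x%llx\n", i,
			    (unsigned long long)cursor);
			cursor += size;
		}
		return (0);
	}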
 
+/*
+ * ==========================================================================
+ * New dynamic fit allocator -
+ * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
+ * contiguous blocks. If no region is found then just use the largest segment
+ * that remains.
+ * ==========================================================================
+ */
+
+/*
+ * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
+ * to request from the allocator.
+ */
 uint64_t metaslab_ndf_clump_shift = 4;
 
 static uint64_t
-metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
+metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 {
-	avl_tree_t *t = &sm->sm_root;
+	avl_tree_t *t = &msp->ms_tree->rt_root;
 	avl_index_t where;
-	space_seg_t *ss, ssearch;
-	uint64_t hbit = highbit(size);
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
-	uint64_t max_size = metaslab_pp_maxsize(sm);
+	range_seg_t *rs, rsearch;
+	uint64_t hbit = highbit64(size);
+	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
+	uint64_t max_size = metaslab_block_maxsize(msp);
 
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 
 	if (max_size < size)
 		return (-1ULL);
 
-	ssearch.ss_start = *cursor;
-	ssearch.ss_end = *cursor + size;
+	rsearch.rs_start = *cursor;
+	rsearch.rs_end = *cursor + size;
 
-	ss = avl_find(t, &ssearch, &where);
-	if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
-		t = sm->sm_pp_root;
+	rs = avl_find(t, &rsearch, &where);
+	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
+		t = &msp->ms_size_tree;
 
-		ssearch.ss_start = 0;
-		ssearch.ss_end = MIN(max_size,
+		rsearch.rs_start = 0;
+		rsearch.rs_end = MIN(max_size,
 		    1ULL << (hbit + metaslab_ndf_clump_shift));
-		ss = avl_find(t, &ssearch, &where);
-		if (ss == NULL)
-			ss = avl_nearest(t, where, AVL_AFTER);
-		ASSERT(ss != NULL);
+		rs = avl_find(t, &rsearch, &where);
+		if (rs == NULL)
+			rs = avl_nearest(t, where, AVL_AFTER);
+		ASSERT(rs != NULL);
 	}
 
-	if (ss != NULL) {
-		if (ss->ss_start + size <= ss->ss_end) {
-			*cursor = ss->ss_start + size;
-			return (ss->ss_start);
-		}
+	if ((rs->rs_end - rs->rs_start) >= size) {
+		*cursor = rs->rs_start + size;
+		return (rs->rs_start);
 	}
 	return (-1ULL);
 }
 
-static boolean_t
-metaslab_ndf_fragmented(space_map_t *sm)
-{
-	uint64_t max_size = metaslab_pp_maxsize(sm);
-
-	if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
-		return (B_FALSE);
-	return (B_TRUE);
-}
-
-
-static space_map_ops_t metaslab_ndf_ops = {
-	metaslab_pp_load,
-	metaslab_pp_unload,
-	metaslab_ndf_alloc,
-	metaslab_pp_claim,
-	metaslab_pp_free,
-	metaslab_pp_maxsize,
-	metaslab_ndf_fragmented
+static metaslab_ops_t metaslab_ndf_ops = {
+	metaslab_ndf_alloc
 };
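
For the ndf allocator, the fallback search above is capped at
1 << (hbit + metaslab_ndf_clump_shift), i.e. a clump of contiguous space a few
powers of two larger than the request. A quick computation of that cap
(request size invented):

	#include <stdio.h>
	#include <stdint.h>

	static int
	highbit64(uint64_t x)
	{
		return (x == 0 ? 0 : 64 - __builtin_clzll(x));	/* 1-based */
	}

	int
	main(void)
	{
		uint64_t metaslab_ndf_clump_shift = 4;	/* default from the patch */
		uint64_t size = 8192;			/* hypothetical request */
		uint64_t hbit = highbit64(size);

		printf("request %llu -> clump cap %llu bytes\n",
		    (unsigned long long)size,
		    (unsigned long long)(1ULL << (hbit + metaslab_ndf_clump_shift)));
		return (0);
	}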
 
-space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 
 /*
  * ==========================================================================
@@ -723,38 +1368,109 @@
  * Metaslabs
  * ==========================================================================
  */
-metaslab_t *
-metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
-	uint64_t start, uint64_t size, uint64_t txg)
+
+/*
+ * Wait for any in-progress metaslab loads to complete.
+ */
+void
+metaslab_load_wait(metaslab_t *msp)
 {
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	while (msp->ms_loading) {
+		ASSERT(!msp->ms_loaded);
+		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
+	}
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+	int error = 0;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT(!msp->ms_loaded);
+	ASSERT(!msp->ms_loading);
+
+	msp->ms_loading = B_TRUE;
+
+	/*
+	 * If the space map has not been allocated yet, then treat
+	 * all the space in the metaslab as free and add it to the
+	 * ms_tree.
+	 */
+	if (msp->ms_sm != NULL)
+		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
+	else
+		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
+
+	msp->ms_loaded = (error == 0);
+	msp->ms_loading = B_FALSE;
+
+	if (msp->ms_loaded) {
+		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+			range_tree_walk(msp->ms_defertree[t],
+			    range_tree_remove, msp->ms_tree);
+		}
+	}
+	cv_broadcast(&msp->ms_load_cv);
+	return (error);
+}
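
The ms_loading flag plus ms_load_cv is the standard condition-variable "wait
for in-flight work" protocol. A userland rendition with pthreads standing in
for the kernel cv_*/mutex_* calls (structure simplified, not the kernel code):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t ms_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t ms_load_cv = PTHREAD_COND_INITIALIZER;
	static int ms_loading, ms_loaded;

	static void
	load_wait(void)	/* caller holds ms_lock, like metaslab_load_wait() */
	{
		while (ms_loading)
			pthread_cond_wait(&ms_load_cv, &ms_lock);
	}

	static void *
	loader(void *arg)
	{
		(void) arg;
		pthread_mutex_lock(&ms_lock);
		/* ... read the space map from disk (elided) ... */
		ms_loaded = 1;
		ms_loading = 0;
		pthread_cond_broadcast(&ms_load_cv);	/* wake all waiters */
		pthread_mutex_unlock(&ms_lock);
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t t;

		ms_loading = 1;		/* a load is already in flight */
		pthread_create(&t, NULL, loader, NULL);

		pthread_mutex_lock(&ms_lock);
		load_wait();
		printf("loaded: %d\n", ms_loaded);
		pthread_mutex_unlock(&ms_lock);
		pthread_join(t, NULL);
		return (0);
	}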
+
+void
+metaslab_unload(metaslab_t *msp)
+{
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	range_tree_vacate(msp->ms_tree, NULL, NULL);
+	msp->ms_loaded = B_FALSE;
+	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
+}
+
+int
+metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
+    metaslab_t **msp)
+{
 	vdev_t *vd = mg->mg_vd;
-	metaslab_t *msp;
+	objset_t *mos = vd->vdev_spa->spa_meta_objset;
+	metaslab_t *ms;
+	int error;
 
-	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
-	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
+	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
+	ms->ms_id = id;
+	ms->ms_start = id << vd->vdev_ms_shift;
+	ms->ms_size = 1ULL << vd->vdev_ms_shift;
 
-	msp->ms_smo_syncing = *smo;
+	/*
+	 * We only open space map objects that already exist. All others
+	 * will be opened when we finally allocate objects for them.
+	 */
+	if (object != 0) {
+		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
+		    ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
 
+		if (error != 0) {
+			kmem_free(ms, sizeof (metaslab_t));
+			return (error);
+		}
+
+		ASSERT(ms->ms_sm != NULL);
+	}
+
 	/*
-	 * We create the main space map here, but we don't create the
-	 * allocmaps and freemaps until metaslab_sync_done().  This serves
+	 * We create the main range tree here, but we don't create the
+	 * alloctree and freetree until metaslab_sync_done().  This serves
 	 * two purposes: it allows metaslab_sync_done() to detect the
 	 * addition of new space; and for debugging, it ensures that we'd
 	 * data fault on any attempt to use this metaslab before it's ready.
 	 */
-	msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
-	space_map_create(msp->ms_map, start, size,
-	    vd->vdev_ashift, &msp->ms_lock);
+	ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
+	metaslab_group_add(mg, ms);
 
-	metaslab_group_add(mg, msp);
+	ms->ms_fragmentation = metaslab_fragmentation(ms);
+	ms->ms_ops = mg->mg_class->mc_ops;
 
-	if (metaslab_debug && smo->smo_object != 0) {
-		mutex_enter(&msp->ms_lock);
-		VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops,
-		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
-		mutex_exit(&msp->ms_lock);
-	}
-
 	/*
 	 * If we're opening an existing pool (txg == 0) or creating
 	 * a new one (txg == TXG_INITIAL), all space is available now.
@@ -762,14 +1478,27 @@
 	 * does not become available until after this txg has synced.
 	 */
 	if (txg <= TXG_INITIAL)
-		metaslab_sync_done(msp, 0);
+		metaslab_sync_done(ms, 0);
 
+	/*
+	 * If metaslab_debug_load is set and we're initializing a metaslab
+	 * that has an allocated space_map object then load its space
+	 * map so that we can verify frees.
+	 */
+	if (metaslab_debug_load && ms->ms_sm != NULL) {
+		mutex_enter(&ms->ms_lock);
+		VERIFY0(metaslab_load(ms));
+		mutex_exit(&ms->ms_lock);
+	}
+
 	if (txg != 0) {
 		vdev_dirty(vd, 0, NULL, txg);
-		vdev_dirty(vd, VDD_METASLAB, msp, txg);
+		vdev_dirty(vd, VDD_METASLAB, ms, txg);
 	}
 
-	return (msp);
+	*msp = ms;
+
+	return (0);
 }
 
 void
@@ -777,48 +1506,149 @@
 {
 	metaslab_group_t *mg = msp->ms_group;
 
-	vdev_space_update(mg->mg_vd,
-	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size);
-
 	metaslab_group_remove(mg, msp);
 
 	mutex_enter(&msp->ms_lock);
 
-	space_map_unload(msp->ms_map);
-	space_map_destroy(msp->ms_map);
-	kmem_free(msp->ms_map, sizeof (*msp->ms_map));
+	VERIFY(msp->ms_group == NULL);
+	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
+	    0, -msp->ms_size);
+	space_map_close(msp->ms_sm);
 
+	metaslab_unload(msp);
+	range_tree_destroy(msp->ms_tree);
+
 	for (int t = 0; t < TXG_SIZE; t++) {
-		space_map_destroy(msp->ms_allocmap[t]);
-		space_map_destroy(msp->ms_freemap[t]);
-		kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t]));
-		kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t]));
+		range_tree_destroy(msp->ms_alloctree[t]);
+		range_tree_destroy(msp->ms_freetree[t]);
 	}
 
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-		space_map_destroy(msp->ms_defermap[t]);
-		kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t]));
+		range_tree_destroy(msp->ms_defertree[t]);
 	}
 
 	ASSERT0(msp->ms_deferspace);
 
 	mutex_exit(&msp->ms_lock);
+	cv_destroy(&msp->ms_load_cv);
 	mutex_destroy(&msp->ms_lock);
 
 	kmem_free(msp, sizeof (metaslab_t));
 }
 
-#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
-#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
-#define	METASLAB_ACTIVE_MASK		\
-	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+#define	FRAGMENTATION_TABLE_SIZE	17
 
+/*
+ * This table defines a segment size based fragmentation metric that will
+ * allow each metaslab to derive its own fragmentation value. This is done
+ * by calculating the space in each bucket of the spacemap histogram and
+ * multiplying that by the fragmentation metric in this table. Doing
+ * this for all buckets and dividing it by the total amount of free
+ * space in this metaslab (i.e. the total free space in all buckets) gives
+ * us the fragmentation metric. This means that a high fragmentation metric
+ * equates to most of the free space being comprised of small segments.
+ * Conversely, if the metric is low, then most of the free space is in
+ * large segments. A 10% change in fragmentation equates to approximately
+ * double the number of segments.
+ *
+ * This table defines 0% fragmented space using 16MB segments. Testing has
+ * shown that segments that are greater than or equal to 16MB do not suffer
+ * from drastic performance problems. Using this value, we derive the rest
+ * of the table. Since the fragmentation value is never stored on disk, it
+ * is possible to change these calculations in the future.
+ */
+int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
+	100,	/* 512B	*/
+	100,	/* 1K	*/
+	98,	/* 2K	*/
+	95,	/* 4K	*/
+	90,	/* 8K	*/
+	80,	/* 16K	*/
+	70,	/* 32K	*/
+	60,	/* 64K	*/
+	50,	/* 128K	*/
+	40,	/* 256K	*/
+	30,	/* 512K	*/
+	20,	/* 1M	*/
+	15,	/* 2M	*/
+	10,	/* 4M	*/
+	5,	/* 8M	*/
+	0	/* 16M	*/
+};
+
+/*
+ * Calculate the metaslab's fragmentation metric. A return value
+ * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
+ * not support this metric. Otherwise, the return value should be in the
+ * range [0, 100].
+ */
 static uint64_t
+metaslab_fragmentation(metaslab_t *msp)
+{
+	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+	uint64_t fragmentation = 0;
+	uint64_t total = 0;
+	boolean_t feature_enabled = spa_feature_is_enabled(spa,
+	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
+
+	if (!feature_enabled)
+		return (ZFS_FRAG_INVALID);
+
+	/*
+	 * A null space map means that the entire metaslab is free
+	 * and thus is not fragmented.
+	 */
+	if (msp->ms_sm == NULL)
+		return (0);
+
+	/*
+	 * If this metaslab's space_map has not been upgraded, flag it
+	 * so that we upgrade next time we encounter it.
+	 */
+	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
+		uint64_t txg = spa_syncing_txg(spa);
+		vdev_t *vd = msp->ms_group->mg_vd;
+
+		if (spa_writeable(spa)) {
+			msp->ms_condense_wanted = B_TRUE;
+			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
+			spa_dbgmsg(spa, "txg %llu, requesting force condense: "
+			    "msp %p, vd %p", txg, msp, vd);
+		}
+		return (ZFS_FRAG_INVALID);
+	}
+
+	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+		uint64_t space = 0;
+		uint8_t shift = msp->ms_sm->sm_shift;
+		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
+		    FRAGMENTATION_TABLE_SIZE - 1);
+
+		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
+			continue;
+
+		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
+		total += space;
+
+		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
+		fragmentation += space * zfs_frag_table[idx];
+	}
+
+	if (total > 0)
+		fragmentation /= total;
+	ASSERT3U(fragmentation, <=, 100);
+	return (fragmentation);
+}
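
A worked instance of the metric (histogram contents and shift invented): free
space in each bucket is weighted by the table entry for its size class, and
the weighted sum is divided by total free space, so small segments dominate
the result:

	#include <stdio.h>
	#include <stdint.h>

	#define SPA_MINBLOCKSHIFT	9
	#define TABLE_SIZE		17

	static int frag_table[TABLE_SIZE] = {
		100, 100, 98, 95, 90, 80, 70, 60,
		50, 40, 30, 20, 15, 10, 5, 0, 0
	};

	int
	main(void)
	{
		uint64_t hist[32] = { 0 };
		uint8_t shift = 9;	/* 512-byte space map shift */
		uint64_t frag = 0, total = 0;

		hist[0] = 100;	/* 100 x 512B free segments */
		hist[8] = 10;	/* 10 x 128K free segments */

		for (int i = 0; i < 32; i++) {
			if (hist[i] == 0)
				continue;
			uint64_t space = hist[i] << (i + shift);
			int idx = shift - SPA_MINBLOCKSHIFT + i;
			if (idx > TABLE_SIZE - 1)
				idx = TABLE_SIZE - 1;
			total += space;
			frag += space * frag_table[idx];
		}
		if (total > 0)
			frag /= total;
		/* Prints 51%: the many 512B segments pull the metric up. */
		printf("fragmentation = %llu%%\n", (unsigned long long)frag);
		return (0);
	}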
+
+/*
+ * Compute a weight -- a selection preference value -- for the given metaslab.
+ * This is based on the amount of free space, the level of fragmentation,
+ * the LBA range, and whether the metaslab is loaded.
+ */
+static uint64_t
 metaslab_weight(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
-	space_map_t *sm = msp->ms_map;
-	space_map_obj_t *smo = &msp->ms_smo;
 	vdev_t *vd = mg->mg_vd;
 	uint64_t weight, space;
 
@@ -829,7 +1659,7 @@
 	 * for us to do here.
 	 */
 	if (vd->vdev_removing) {
-		ASSERT0(smo->smo_alloc);
+		ASSERT0(space_map_allocated(msp->ms_sm));
 		ASSERT0(vd->vdev_ms_shift);
 		return (0);
 	}
@@ -837,7 +1667,30 @@
 	/*
 	 * The baseline weight is the metaslab's free space.
 	 */
-	space = sm->sm_size - smo->smo_alloc;
+	space = msp->ms_size - space_map_allocated(msp->ms_sm);
+
+	msp->ms_fragmentation = metaslab_fragmentation(msp);
+	if (metaslab_fragmentation_factor_enabled &&
+	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
+		/*
+		 * Use the fragmentation information to inversely scale
+		 * down the baseline weight. We need to ensure that we
+		 * don't exclude this metaslab completely when it's 100%
+		 * fragmented. To avoid this we reduce the fragmented value
+		 * by 1.
+		 */
+		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
+
+		/*
+		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
+		 * this metaslab again. The fragmentation metric may have
+		 * decreased the space to something smaller than
+		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
+		 * so that we can consume any remaining space.
+		 */
+		if (space > 0 && space < SPA_MINBLOCKSIZE)
+			space = SPA_MINBLOCKSIZE;
+	}
 	weight = space;
 
 	/*
@@ -849,100 +1702,44 @@
 	 * In effect, this means that we'll select the metaslab with the most
 	 * free bandwidth rather than simply the one with the most free space.
 	 */
-	weight = 2 * weight -
-	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
-	ASSERT(weight >= space && weight <= 2 * space);
+	if (metaslab_lba_weighting_enabled) {
+		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
+		ASSERT(weight >= space && weight <= 2 * space);
+	}
 
 	/*
-	 * For locality, assign higher weight to metaslabs which have
-	 * a lower offset than what we've already activated.
+	 * If this metaslab is one we're actively using, adjust its
+	 * weight to make it preferable to any inactive metaslab so
+	 * we'll polish it off. If the fragmentation on this metaslab
+	 * has exceeded our threshold, then don't mark it active.
 	 */
-	if (sm->sm_start <= mg->mg_bonus_area)
-		weight *= (metaslab_smo_bonus_pct / 100);
-	ASSERT(weight >= space &&
-	    weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
-
-	if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
-		/*
-		 * If this metaslab is one we're actively using, adjust its
-		 * weight to make it preferable to any inactive metaslab so
-		 * we'll polish it off.
-		 */
+	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
+	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
 	}
+
 	return (weight);
 }
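
To see what the LBA term does, evaluate the line
weight = 2 * weight - (ms_id * weight) / ms_count at a few points along a
vdev (numbers invented): the outermost metaslab is worth double its raw free
space, the innermost barely more than 1x:

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t ms_count = 200;	/* metaslabs on the vdev */
		uint64_t space = 1ULL << 30;	/* 1 GiB free in each */
		uint64_t ids[] = { 0, 100, 199 };

		for (int i = 0; i < 3; i++) {
			uint64_t w = 2 * space - (ids[i] * space) / ms_count;
			/* Prints 2.000x, 1.500x, 1.005x respectively. */
			printf("ms_id %3llu -> %.3fx its free space\n",
			    (unsigned long long)ids[i],
			    (double)w / (double)space);
		}
		return (0);
	}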
 
-static void
-metaslab_prefetch(metaslab_group_t *mg)
-{
-	spa_t *spa = mg->mg_vd->vdev_spa;
-	metaslab_t *msp;
-	avl_tree_t *t = &mg->mg_metaslab_tree;
-	int m;
-
-	mutex_enter(&mg->mg_lock);
-
-	/*
-	 * Prefetch the next potential metaslabs
-	 */
-	for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
-		space_map_t *sm = msp->ms_map;
-		space_map_obj_t *smo = &msp->ms_smo;
-
-		/* If we have reached our prefetch limit then we're done */
-		if (m >= metaslab_prefetch_limit)
-			break;
-
-		if (!sm->sm_loaded && smo->smo_object != 0) {
-			mutex_exit(&mg->mg_lock);
-			dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
-			    0ULL, smo->smo_objsize);
-			mutex_enter(&mg->mg_lock);
-		}
-	}
-	mutex_exit(&mg->mg_lock);
-}
-
 static int
 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 {
-	metaslab_group_t *mg = msp->ms_group;
-	space_map_t *sm = msp->ms_map;
-	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
-
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
-		space_map_load_wait(sm);
-		if (!sm->sm_loaded) {
-			space_map_obj_t *smo = &msp->ms_smo;
-
-			int error = space_map_load(sm, sm_ops, SM_FREE, smo,
-			    spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
-			if (error)  {
+		metaslab_load_wait(msp);
+		if (!msp->ms_loaded) {
+			int error = metaslab_load(msp);
+			if (error) {
 				metaslab_group_sort(msp->ms_group, msp, 0);
 				return (error);
 			}
-			for (int t = 0; t < TXG_DEFER_SIZE; t++)
-				space_map_walk(msp->ms_defermap[t],
-				    space_map_claim, sm);
-
 		}
 
-		/*
-		 * Track the bonus area as we activate new metaslabs.
-		 */
-		if (sm->sm_start > mg->mg_bonus_area) {
-			mutex_enter(&mg->mg_lock);
-			mg->mg_bonus_area = sm->sm_start;
-			mutex_exit(&mg->mg_lock);
-		}
-
 		metaslab_group_sort(msp->ms_group, msp,
 		    msp->ms_weight | activation_weight);
 	}
-	ASSERT(sm->sm_loaded);
+	ASSERT(msp->ms_loaded);
 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 
 	return (0);
@@ -956,26 +1753,102 @@
 	 * this metaslab again.  In that case, it had better be empty,
 	 * or we would be leaving space on the table.
 	 */
-	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map->sm_space == 0);
+	ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
 	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
 }
 
+static void
+metaslab_preload(void *arg)
+{
+	metaslab_t *msp = arg;
+	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
+
+	mutex_enter(&msp->ms_lock);
+	metaslab_load_wait(msp);
+	if (!msp->ms_loaded)
+		(void) metaslab_load(msp);
+
+	/*
+	 * Set the ms_access_txg value so that we don't unload it right away.
+	 */
+	msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
+	mutex_exit(&msp->ms_lock);
+}
+
+static void
+metaslab_group_preload(metaslab_group_t *mg)
+{
+	spa_t *spa = mg->mg_vd->vdev_spa;
+	metaslab_t *msp;
+	avl_tree_t *t = &mg->mg_metaslab_tree;
+	int m = 0;
+
+	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
+		taskq_wait(mg->mg_taskq);
+		return;
+	}
+
+	mutex_enter(&mg->mg_lock);
+	/*
+	 * Load the next potential metaslabs
+	 */
+	msp = avl_first(t);
+	while (msp != NULL) {
+		metaslab_t *msp_next = AVL_NEXT(t, msp);
+
+		/*
+		 * We preload only the maximum number of metaslabs specified
+		 * by metaslab_preload_limit. If a metaslab is being forced
+		 * to condense then we preload it too. This will ensure
+		 * that force condensing happens in the next txg.
+		 */
+		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
+			msp = msp_next;
+			continue;
+		}
+
+		/*
+		 * We must drop the metaslab group lock here to preserve
+		 * lock ordering with the ms_lock (when grabbing both
+		 * the mg_lock and the ms_lock, the ms_lock must be taken
+		 * first).  As a result, it is possible that the ordering
+		 * of the metaslabs within the avl tree may change before
+		 * we reacquire the lock. The metaslab cannot be removed from
+		 * the tree while we're in syncing context so it is safe to
+		 * drop the mg_lock here. If the metaslabs are reordered
+		 * nothing will break -- we just may end up loading a
+		 * less than optimal one.
+		 */
+		mutex_exit(&mg->mg_lock);
+		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
+		    msp, TQ_SLEEP) != 0);
+		mutex_enter(&mg->mg_lock);
+		msp = msp_next;
+	}
+	mutex_exit(&mg->mg_lock);
+}
+
 /*
- * Determine if the in-core space map representation can be condensed on-disk.
- * We would like to use the following criteria to make our decision:
+ * Determine if the space map's on-disk footprint is past our tolerance
+ * for inefficiency. We would like to use the following criteria to make
+ * our decision:
  *
  * 1. The size of the space map object should not dramatically increase as a
- * result of writing out our in-core free map.
+ * result of writing out the free space range tree.
  *
  * 2. The minimal on-disk space map representation is zfs_condense_pct/100
- * times the size than the in-core representation (i.e. zfs_condense_pct = 110
- * and in-core = 1MB, minimal = 1.1.MB).
+ * times the size of the free space range tree representation
+ * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
  *
+ * 3. The on-disk size of the space map should actually decrease.
+ *
  * Checking the first condition is tricky since we don't want to walk
  * the entire AVL tree calculating the estimated on-disk size. Instead we
- * use the size-ordered AVL tree in the space map and calculate the
- * size required for the largest segment in our in-core free map. If the
+ * use the size-ordered range tree in the metaslab and calculate the
+ * size required to write out the largest segment in our free tree. If the
  * size required to represent that segment on disk is larger than the space
  * map object then we avoid condensing this map.
  *
@@ -982,25 +1855,33 @@
  * To determine the second criterion we use a best-case estimate and assume
  * each segment can be represented on-disk as a single 64-bit entry. We refer
  * to this best-case estimate as the space map's minimal form.
+ *
+ * Unfortunately, we cannot compute the on-disk size of the space map in this
+ * context because we cannot accurately compute the effects of compression, etc.
+ * Instead, we apply the heuristic described in the block comment for
+ * zfs_metaslab_condense_block_threshold - we only condense if the space used
+ * is greater than a threshold number of blocks.
  */
 static boolean_t
 metaslab_should_condense(metaslab_t *msp)
 {
-	space_map_t *sm = msp->ms_map;
-	space_map_obj_t *smo = &msp->ms_smo_syncing;
-	space_seg_t *ss;
-	uint64_t size, entries, segsz;
+	space_map_t *sm = msp->ms_sm;
+	range_seg_t *rs;
+	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
+	dmu_object_info_t doi;
+	uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
-	ASSERT(sm->sm_loaded);
+	ASSERT(msp->ms_loaded);
 
 	/*
-	 * Use the sm_pp_root AVL tree, which is ordered by size, to obtain
-	 * the largest segment in the in-core free map. If the tree is
-	 * empty then we should condense the map.
+	 * Use the ms_size_tree range tree, which is ordered by size, to
+	 * obtain the largest segment in the free tree. We always condense
+	 * metaslabs that are empty and metaslabs for which a condense
+	 * request has been made.
 	 */
-	ss = avl_last(sm->sm_pp_root);
-	if (ss == NULL)
+	rs = avl_last(&msp->ms_size_tree);
+	if (rs == NULL || msp->ms_condense_wanted)
 		return (B_TRUE);
 
 	/*
@@ -1009,102 +1890,106 @@
 	 * larger on-disk than the entire current on-disk structure, then
 	 * clearly condensing will increase the on-disk structure size.
 	 */
-	size = (ss->ss_end - ss->ss_start) >> sm->sm_shift;
+	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
 	entries = size / (MIN(size, SM_RUN_MAX));
 	segsz = entries * sizeof (uint64_t);
 
-	return (segsz <= smo->smo_objsize &&
-	    smo->smo_objsize >= (zfs_condense_pct *
-	    sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) / 100);
+	optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
+	object_size = space_map_length(msp->ms_sm);
+
+	dmu_object_info_from_db(sm->sm_dbuf, &doi);
+	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
+
+	return (segsz <= object_size &&
+	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
+	    object_size > zfs_metaslab_condense_block_threshold * record_size);
 }
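
Plugging invented numbers into the three-part test makes it easier to read:
with the values assumed below (tunables and sizes are hypothetical, not read
from the patch), a 1 MB on-disk map whose free tree would serialize to 64 KB
gets condensed:

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		/* All values hypothetical. */
		uint64_t object_size = 1 << 20;		/* on-disk space map: 1 MB */
		uint64_t nsegs = 8000;			/* free range tree nodes */
		uint64_t segsz = sizeof (uint64_t);	/* largest seg: one entry */
		uint64_t condense_pct = 200;		/* assumed tunable value */
		uint64_t record_size = 4096;		/* space map block size */
		uint64_t block_threshold = 4;		/* assumed tunable value */

		/* Best case: every segment is a single 64-bit entry. */
		uint64_t optimal_size = nsegs * sizeof (uint64_t);

		int condense = segsz <= object_size &&
		    object_size >= optimal_size * condense_pct / 100 &&
		    object_size > block_threshold * record_size;

		printf("optimal %llu B, on-disk %llu B -> condense: %s\n",
		    (unsigned long long)optimal_size,
		    (unsigned long long)object_size,
		    condense ? "yes" : "no");
		return (0);
	}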
 
 /*
  * Condense the on-disk space map representation to its minimized form.
  * The minimized form consists of a small number of allocations followed by
- * the in-core free map.
+ * the entries of the free range tree.
  */
 static void
 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
-	space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK];
-	space_map_t condense_map;
-	space_map_t *sm = msp->ms_map;
-	objset_t *mos = spa_meta_objset(spa);
-	space_map_obj_t *smo = &msp->ms_smo_syncing;
+	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
+	range_tree_t *condense_tree;
+	space_map_t *sm = msp->ms_sm;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
-	ASSERT(sm->sm_loaded);
+	ASSERT(msp->ms_loaded);
 
-	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
-	    "smo size %llu, segments %lu", txg,
-	    (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
-	    smo->smo_objsize, avl_numnodes(&sm->sm_root));
 
+	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
+	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
+	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
+	    msp->ms_group->mg_vd->vdev_spa->spa_name,
+	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
+	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
+
+	msp->ms_condense_wanted = B_FALSE;
+
 	/*
-	 * Create an map that is a 100% allocated map. We remove segments
+	 * Create a range tree that is 100% allocated. We remove segments
 	 * that have been freed in this txg, any deferred frees that exist,
 	 * and any allocation in the future. Removing segments should be
-	 * a relatively inexpensive operation since we expect these maps to
-	 * a small number of nodes.
+	 * a relatively inexpensive operation since we expect these trees to
+	 * have a small number of nodes.
 	 */
-	space_map_create(&condense_map, sm->sm_start, sm->sm_size,
-	    sm->sm_shift, sm->sm_lock);
-	space_map_add(&condense_map, condense_map.sm_start,
-	    condense_map.sm_size);
+	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
+	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
 
 	/*
-	 * Remove what's been freed in this txg from the condense_map.
+	 * Remove what's been freed in this txg from the condense_tree.
 	 * Since we're in sync_pass 1, we know that all the frees from
-	 * this txg are in the freemap.
+	 * this txg are in the freetree.
 	 */
-	space_map_walk(freemap, space_map_remove, &condense_map);
+	range_tree_walk(freetree, range_tree_remove, condense_tree);
 
-	for (int t = 0; t < TXG_DEFER_SIZE; t++)
-		space_map_walk(msp->ms_defermap[t],
-		    space_map_remove, &condense_map);
+	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+		range_tree_walk(msp->ms_defertree[t],
+		    range_tree_remove, condense_tree);
+	}
 
-	for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
-		space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK],
-		    space_map_remove, &condense_map);
+	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
+		    range_tree_remove, condense_tree);
+	}
 
 	/*
 	 * We're about to drop the metaslab's lock thus allowing
 	 * other consumers to change its content. Set the
-	 * space_map's sm_condensing flag to ensure that
+	 * metaslab's ms_condensing flag to ensure that
 	 * allocations on this metaslab do not occur while we're
 	 * in the middle of committing it to disk. This is only critical
-	 * for the ms_map as all other space_maps use per txg
+	 * for the ms_tree as all other range trees use per txg
 	 * views of their content.
 	 */
-	sm->sm_condensing = B_TRUE;
+	msp->ms_condensing = B_TRUE;
 
 	mutex_exit(&msp->ms_lock);
-	space_map_truncate(smo, mos, tx);
+	space_map_truncate(sm, tx);
 	mutex_enter(&msp->ms_lock);
 
 	/*
 	 * While we would ideally like to create a space_map representation
 	 * that consists only of allocation records, doing so can be
-	 * prohibitively expensive because the in-core free map can be
+	 * prohibitively expensive because the in-core free tree can be
 	 * large, and therefore computationally expensive to subtract
-	 * from the condense_map. Instead we sync out two maps, a cheap
-	 * allocation only map followed by the in-core free map. While not
+	 * from the condense_tree. Instead we sync out two trees, a cheap
+	 * allocation only tree followed by the in-core free tree. While not
 	 * optimal, this is typically close to optimal, and much cheaper to
 	 * compute.
 	 */
-	space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx);
-	space_map_vacate(&condense_map, NULL, NULL);
-	space_map_destroy(&condense_map);
+	space_map_write(sm, condense_tree, SM_ALLOC, tx);
+	range_tree_vacate(condense_tree, NULL, NULL);
+	range_tree_destroy(condense_tree);
 
-	space_map_sync(sm, SM_FREE, smo, mos, tx);
-	sm->sm_condensing = B_FALSE;
-
-	spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, "
-	    "smo size %llu", txg,
-	    (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
-	    smo->smo_objsize);
+	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
+	msp->ms_condensing = B_FALSE;
 }
 
 /*
@@ -1113,16 +1998,16 @@
 void
 metaslab_sync(metaslab_t *msp, uint64_t txg)
 {
-	vdev_t *vd = msp->ms_group->mg_vd;
+	metaslab_group_t *mg = msp->ms_group;
+	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
-	space_map_t *allocmap = msp->ms_allocmap[txg & TXG_MASK];
-	space_map_t **freemap = &msp->ms_freemap[txg & TXG_MASK];
-	space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
-	space_map_t *sm = msp->ms_map;
-	space_map_obj_t *smo = &msp->ms_smo_syncing;
-	dmu_buf_t *db;
+	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
+	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
+	range_tree_t **freed_tree =
+	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
 	dmu_tx_t *tx;
+	uint64_t object = space_map_object(msp->ms_sm);
 
 	ASSERT(!vd->vdev_ishole);
 
@@ -1129,78 +2014,114 @@
 	/*
 	 * This metaslab has just been added so there's no work to do now.
 	 */
-	if (*freemap == NULL) {
-		ASSERT3P(allocmap, ==, NULL);
+	if (*freetree == NULL) {
+		ASSERT3P(alloctree, ==, NULL);
 		return;
 	}
 
-	ASSERT3P(allocmap, !=, NULL);
-	ASSERT3P(*freemap, !=, NULL);
-	ASSERT3P(*freed_map, !=, NULL);
+	ASSERT3P(alloctree, !=, NULL);
+	ASSERT3P(*freetree, !=, NULL);
+	ASSERT3P(*freed_tree, !=, NULL);
 
-	if (allocmap->sm_space == 0 && (*freemap)->sm_space == 0)
+	/*
+	 * Normally, we don't want to process a metaslab if there
+	 * are no allocations or frees to perform. However, if the metaslab
+	 * is being forced to condense we need to let it through.
+	 */
+	if (range_tree_space(alloctree) == 0 &&
+	    range_tree_space(*freetree) == 0 &&
+	    !msp->ms_condense_wanted)
 		return;
 
 	/*
 	 * The only state that can actually be changing concurrently with
-	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
-	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
-	 * Therefore, we only hold ms_lock to satify space_map ASSERTs.
-	 * We drop it whenever we call into the DMU, because the DMU
-	 * can call down to us (e.g. via zio_free()) at any time.
+	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
+	 * be modifying this txg's alloctree, freetree, freed_tree, or
+	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
+	 * space_map ASSERTs. We drop it whenever we call into the DMU,
+	 * because the DMU can call down to us (e.g. via zio_free()) at
+	 * any time.
 	 */
 
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
-	if (smo->smo_object == 0) {
-		ASSERT(smo->smo_objsize == 0);
-		ASSERT(smo->smo_alloc == 0);
-		smo->smo_object = dmu_object_alloc(mos,
-		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
-		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
-		ASSERT(smo->smo_object != 0);
-		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
-		    (sm->sm_start >> vd->vdev_ms_shift),
-		    sizeof (uint64_t), &smo->smo_object, tx);
+	if (msp->ms_sm == NULL) {
+		uint64_t new_object;
+
+		new_object = space_map_alloc(mos, tx);
+		VERIFY3U(new_object, !=, 0);
+
+		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
+		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
+		    &msp->ms_lock));
+		ASSERT(msp->ms_sm != NULL);
 	}
 
 	mutex_enter(&msp->ms_lock);
 
-	if (sm->sm_loaded && spa_sync_pass(spa) == 1 &&
+	/*
+	 * Note: metaslab_condense() clears the space_map's histogram.
+	 * Therefore we must verify and remove this histogram before
+	 * condensing.
+	 */
+	metaslab_group_histogram_verify(mg);
+	metaslab_class_histogram_verify(mg->mg_class);
+	metaslab_group_histogram_remove(mg, msp);
+
+	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
 	    metaslab_should_condense(msp)) {
 		metaslab_condense(msp, txg, tx);
 	} else {
-		space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
-		space_map_sync(*freemap, SM_FREE, smo, mos, tx);
+		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
+		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
 	}
 
-	space_map_vacate(allocmap, NULL, NULL);
+	if (msp->ms_loaded) {
+		/*
+		 * When the space map is loaded, we have an accurate
+		 * histogram in the range tree. This gives us an opportunity
+		 * to bring the space map's histogram up-to-date so we clear
+		 * it first before updating it.
+		 */
+		space_map_histogram_clear(msp->ms_sm);
+		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
+	} else {
+		/*
+		 * Since the space map is not loaded we simply update the
+		 * existing histogram with what was freed in this txg. This
+		 * means that the on-disk histogram may not have an accurate
+		 * view of the free space but it's close enough to allow
+		 * us to make allocation decisions.
+		 */
+		space_map_histogram_add(msp->ms_sm, *freetree, tx);
+	}
+	metaslab_group_histogram_add(mg, msp);
+	metaslab_group_histogram_verify(mg);
+	metaslab_class_histogram_verify(mg->mg_class);
 
 	/*
-	 * For sync pass 1, we avoid walking the entire space map and
-	 * instead will just swap the pointers for freemap and
-	 * freed_map. We can safely do this since the freed_map is
+	 * For sync pass 1, we avoid traversing this txg's free range tree
+	 * and instead will just swap the pointers for freetree and
+	 * freed_tree. We can safely do this since the freed_tree is
 	 * guaranteed to be empty on the initial pass.
 	 */
 	if (spa_sync_pass(spa) == 1) {
-		ASSERT0((*freed_map)->sm_space);
-		ASSERT0(avl_numnodes(&(*freed_map)->sm_root));
-		space_map_swap(freemap, freed_map);
+		range_tree_swap(freetree, freed_tree);
 	} else {
-		space_map_vacate(*freemap, space_map_add, *freed_map);
+		range_tree_vacate(*freetree, range_tree_add, *freed_tree);
 	}
+	range_tree_vacate(alloctree, NULL, NULL);
 
-	ASSERT0(msp->ms_allocmap[txg & TXG_MASK]->sm_space);
-	ASSERT0(msp->ms_freemap[txg & TXG_MASK]->sm_space);
+	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
+	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
 
 	mutex_exit(&msp->ms_lock);
 
-	VERIFY0(dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
-	dmu_buf_will_dirty(db, tx);
-	ASSERT3U(db->db_size, >=, sizeof (*smo));
-	bcopy(smo, db->db_data, sizeof (*smo));
-	dmu_buf_rele(db, FTAG);
-
+	if (object != space_map_object(msp->ms_sm)) {
+		object = space_map_object(msp->ms_sm);
+		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
+		    msp->ms_id, sizeof (uint64_t), &object, tx);
+	}
 	dmu_tx_commit(tx);
 }
 
@@ -1211,13 +2132,10 @@
 void
 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 {
-	space_map_obj_t *smo = &msp->ms_smo;
-	space_map_obj_t *smosync = &msp->ms_smo_syncing;
-	space_map_t *sm = msp->ms_map;
-	space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
-	space_map_t **defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
+	range_tree_t **freed_tree;
+	range_tree_t **defer_tree;
 	int64_t alloc_delta, defer_delta;
 
 	ASSERT(!vd->vdev_ishole);
@@ -1226,63 +2144,63 @@
 
 	/*
 	 * If this metaslab is just becoming available, initialize its
-	 * allocmaps, freemaps, and defermap and add its capacity to the vdev.
+	 * alloctrees, freetrees, and defertree and add its capacity to
+	 * the vdev.
 	 */
-	if (*freed_map == NULL) {
-		ASSERT(*defer_map == NULL);
+	if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
 		for (int t = 0; t < TXG_SIZE; t++) {
-			msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t),
-			    KM_SLEEP);
-			space_map_create(msp->ms_allocmap[t], sm->sm_start,
-			    sm->sm_size, sm->sm_shift, sm->sm_lock);
-			msp->ms_freemap[t] = kmem_zalloc(sizeof (space_map_t),
-			    KM_SLEEP);
-			space_map_create(msp->ms_freemap[t], sm->sm_start,
-			    sm->sm_size, sm->sm_shift, sm->sm_lock);
+			ASSERT(msp->ms_alloctree[t] == NULL);
+			ASSERT(msp->ms_freetree[t] == NULL);
+
+			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
+			    &msp->ms_lock);
+			msp->ms_freetree[t] = range_tree_create(NULL, msp,
+			    &msp->ms_lock);
 		}
 
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-			msp->ms_defermap[t] = kmem_zalloc(sizeof (space_map_t),
-			    KM_SLEEP);
-			space_map_create(msp->ms_defermap[t], sm->sm_start,
-			    sm->sm_size, sm->sm_shift, sm->sm_lock);
+			ASSERT(msp->ms_defertree[t] == NULL);
+
+			msp->ms_defertree[t] = range_tree_create(NULL, msp,
+			    &msp->ms_lock);
 		}
 
-		freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
-		defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
-
-		vdev_space_update(vd, 0, 0, sm->sm_size);
+		vdev_space_update(vd, 0, 0, msp->ms_size);
 	}
 
-	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
-	defer_delta = (*freed_map)->sm_space - (*defer_map)->sm_space;
+	freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
+	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
 
+	alloc_delta = space_map_alloc_delta(msp->ms_sm);
+	defer_delta = range_tree_space(*freed_tree) -
+	    range_tree_space(*defer_tree);
+
 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
 
-	ASSERT(msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0);
-	ASSERT(msp->ms_freemap[txg & TXG_MASK]->sm_space == 0);
+	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
+	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
 
 	/*
-	 * If there's a space_map_load() in progress, wait for it to complete
+	 * If there's a metaslab_load() in progress, wait for it to complete
 	 * so that we have a consistent view of the in-core space map.
 	 */
-	space_map_load_wait(sm);
+	metaslab_load_wait(msp);
 
 	/*
-	 * Move the frees from the defer_map to this map (if it's loaded).
-	 * Swap the freed_map and the defer_map -- this is safe to do
-	 * because we've just emptied out the defer_map.
+	 * Move the frees from the defer_tree back to the free
+	 * range tree (if it's loaded). Swap the freed_tree and the
+	 * defer_tree -- this is safe to do because we've just emptied out
+	 * the defer_tree.
 	 */
-	space_map_vacate(*defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
-	ASSERT0((*defer_map)->sm_space);
-	ASSERT0(avl_numnodes(&(*defer_map)->sm_root));
-	space_map_swap(freed_map, defer_map);
+	range_tree_vacate(*defer_tree,
+	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
+	range_tree_swap(freed_tree, defer_tree);
 
-	*smo = *smosync;
+	space_map_update(msp->ms_sm);
 
 	msp->ms_deferspace += defer_delta;
 	ASSERT3S(msp->ms_deferspace, >=, 0);
-	ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
+	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
 	if (msp->ms_deferspace != 0) {
 		/*
 		 * Keep syncing this metaslab until all deferred frees
@@ -1291,24 +2209,17 @@
 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 	}
 
-	/*
-	 * If the map is loaded but no longer active, evict it as soon as all
-	 * future allocations have synced.  (If we unloaded it now and then
-	 * loaded a moment later, the map wouldn't reflect those allocations.)
-	 */
-	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
-		int evictable = 1;
+	if (msp->ms_loaded && msp->ms_access_txg < txg) {
+		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+			VERIFY0(range_tree_space(
+			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
+		}
 
-		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
-			if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space)
-				evictable = 0;
-
-		if (evictable && !metaslab_debug)
-			space_map_unload(sm);
+		if (!metaslab_debug_unload)
+			metaslab_unload(msp);
 	}
 
 	metaslab_group_sort(mg, msp, metaslab_weight(msp));
-
 	mutex_exit(&msp->ms_lock);
 }
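
The per-txg trees in metaslab_sync_done() are selected out of small ring buffers: alloc/free trees by "txg & TXG_MASK" and defer trees by "txg % TXG_DEFER_SIZE". A standalone sketch of that index math, with constants copied from ZFS's txg.h (TXG_SIZE is 4 and TXG_DEFER_SIZE is 2 there); this is an illustration, not the kernel code:

#include <stdio.h>
#include <stdint.h>

#define	TXG_SIZE	4			/* concurrently open txgs */
#define	TXG_MASK	(TXG_SIZE - 1)		/* ring index mask */
#define	TXG_DEFER_SIZE	2			/* deferred-free ring */
#define	TXG_CLEAN(txg)	((txg) - 1)		/* txg that just synced */

int
main(void)
{
	for (uint64_t txg = 10; txg < 14; txg++) {
		/* which slot holds the frees of the just-synced txg */
		printf("txg %ju: freed slot %ju, defer slot %ju\n",
		    (uintmax_t)txg,
		    (uintmax_t)(TXG_CLEAN(txg) & TXG_MASK),
		    (uintmax_t)(txg % TXG_DEFER_SIZE));
	}
	return (0);
}
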
 
@@ -1315,30 +2226,13 @@
 void
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
-	vdev_t *vd = mg->mg_vd;
-	int64_t failures = mg->mg_alloc_failures;
+	metaslab_group_alloc_update(mg);
+	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
 
 	/*
-	 * Re-evaluate all metaslabs which have lower offsets than the
-	 * bonus area.
+	 * Preload the next potential metaslabs
 	 */
-	for (int m = 0; m < vd->vdev_ms_count; m++) {
-		metaslab_t *msp = vd->vdev_ms[m];
-
-		if (msp->ms_map->sm_start > mg->mg_bonus_area)
-			break;
-
-		mutex_enter(&msp->ms_lock);
-		metaslab_group_sort(mg, msp, metaslab_weight(msp));
-		mutex_exit(&msp->ms_lock);
-	}
-
-	atomic_add_64(&mg->mg_alloc_failures, -failures);
-
-	/*
-	 * Prefetch the next potential metaslabs
-	 */
-	metaslab_prefetch(mg);
+	metaslab_group_preload(mg);
 }
 
 static uint64_t
@@ -1346,7 +2240,7 @@
 {
 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
-	uint64_t start = msp->ms_map->sm_start >> ms_shift;
+	uint64_t start = msp->ms_id;
 
 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
 		return (1ULL << 63);
@@ -1358,9 +2252,58 @@
 	return (0);
 }
 
+/*
+ * ==========================================================================
+ * Metaslab block operations
+ * ==========================================================================
+ */
+
+static void
+metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
+{
+	if (!(flags & METASLAB_ASYNC_ALLOC) ||
+	    flags & METASLAB_DONT_THROTTLE)
+		return;
+
+	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+	if (!mg->mg_class->mc_alloc_throttle_enabled)
+		return;
+
+	(void) refcount_add(&mg->mg_alloc_queue_depth, tag);
+}
+
+void
+metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
+{
+	if (!(flags & METASLAB_ASYNC_ALLOC) ||
+	    flags & METASLAB_DONT_THROTTLE)
+		return;
+
+	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+	if (!mg->mg_class->mc_alloc_throttle_enabled)
+		return;
+
+	(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
+}
+
+void
+metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
+{
+#ifdef ZFS_DEBUG
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = BP_GET_NDVAS(bp);
+
+	for (int d = 0; d < ndvas; d++) {
+		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
+		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+		VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
+	}
+#endif
+}
+
 static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
-    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
+    uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
 {
 	spa_t *spa = mg->mg_vd->vdev_spa;
 	metaslab_t *msp = NULL;
@@ -1386,11 +2329,10 @@
 			if (msp->ms_weight < asize) {
 				spa_dbgmsg(spa, "%s: failed to meet weight "
 				    "requirement: vdev %llu, txg %llu, mg %p, "
-				    "msp %p, psize %llu, asize %llu, "
-				    "failures %llu, weight %llu",
-				    spa_name(spa), mg->mg_vd->vdev_id, txg,
-				    mg, msp, psize, asize,
-				    mg->mg_alloc_failures, msp->ms_weight);
+				    "msp %p, asize %llu, "
+				    "weight %llu", spa_name(spa),
+				    mg->mg_vd->vdev_id, txg,
+				    mg, msp, asize, msp->ms_weight);
 				mutex_exit(&mg->mg_lock);
 				return (-1ULL);
 			}
@@ -1398,7 +2340,7 @@
 			/*
 			 * If the selected metaslab is condensing, skip it.
 			 */
-			if (msp->ms_map->sm_condensing)
+			if (msp->ms_condensing)
 				continue;
 
 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
@@ -1406,7 +2348,8 @@
 				break;
 
 			target_distance = min_distance +
-			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
+			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
+			    min_distance >> 1);
 
 			for (i = 0; i < d; i++)
 				if (metaslab_distance(msp, &dva[i]) <
@@ -1419,25 +2362,6 @@
 		if (msp == NULL)
 			return (-1ULL);
 
-		/*
-		 * If we've already reached the allowable number of failed
-		 * allocation attempts on this metaslab group then we
-		 * consider skipping it. We skip it only if we're allowed
-		 * to "fast" gang, the physical size is larger than
-		 * a gang block, and we're attempting to allocate from
-		 * the primary metaslab.
-		 */
-		if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
-		    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
-		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
-			spa_dbgmsg(spa, "%s: skipping metaslab group: "
-			    "vdev %llu, txg %llu, mg %p, psize %llu, "
-			    "asize %llu, failures %llu", spa_name(spa),
-			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
-			    mg->mg_alloc_failures);
-			return (-1ULL);
-		}
-
 		mutex_enter(&msp->ms_lock);
 
 		/*
@@ -1471,28 +2395,25 @@
 		 * we can't manipulate this metaslab until it's committed
 		 * to disk.
 		 */
-		if (msp->ms_map->sm_condensing) {
+		if (msp->ms_condensing) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
-		if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL)
+		if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
 			break;
 
-		atomic_inc_64(&mg->mg_alloc_failures);
-
-		metaslab_passivate(msp, space_map_maxsize(msp->ms_map));
-
+		metaslab_passivate(msp, metaslab_block_maxsize(msp));
 		mutex_exit(&msp->ms_lock);
 	}
 
-	if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
+	if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
-	space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, asize);
+	range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
+	msp->ms_access_txg = txg + metaslab_unload_delay;
 
 	mutex_exit(&msp->ms_lock);
-
 	return (offset);
 }
 
@@ -1509,7 +2430,6 @@
 	int all_zero;
 	int zio_lock = B_FALSE;
 	boolean_t allocatable;
-	uint64_t offset = -1ULL;
 	uint64_t asize;
 	uint64_t distance;
 
@@ -1579,7 +2499,6 @@
 	all_zero = B_TRUE;
 	do {
 		ASSERT(mg->mg_activation_count == 1);
-
 		vd = mg->mg_vd;
 
 		/*
@@ -1592,18 +2511,30 @@
 		} else {
 			allocatable = vdev_allocatable(vd);
 		}
+
+		/*
+		 * Determine if the selected metaslab group is eligible
+		 * for allocations. If we're ganging then don't allow
+		 * this metaslab group to skip allocations since that would
+		 * inadvertently return ENOSPC and suspend the pool
+		 * even though space is still available.
+		 */
+		if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) {
+			allocatable = metaslab_group_allocatable(mg, rotor,
+			    psize);
+		}
+
 		if (!allocatable)
 			goto next;
 
+		ASSERT(mg->mg_initialized);
+
 		/*
-		 * Avoid writing single-copy data to a failing vdev
-		 * unless the user instructs us that it is okay.
+		 * Avoid writing single-copy data to a failing vdev.
 		 */
 		if ((vd->vdev_stat.vs_write_errors > 0 ||
 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
-		    d == 0 && dshift == 3 &&
-		    !(zfs_write_to_degraded && vd->vdev_state ==
-		    VDEV_STATE_DEGRADED)) {
+		    d == 0 && dshift == 3 && vd->vdev_children == 0) {
 			all_zero = B_FALSE;
 			goto next;
 		}
@@ -1619,8 +2550,32 @@
 		asize = vdev_psize_to_asize(vd, psize);
 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
-		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
-		    dva, d, flags);
+		uint64_t offset = metaslab_group_alloc(mg, asize, txg,
+		    distance, dva, d);
+
+		mutex_enter(&mg->mg_lock);
+		if (offset == -1ULL) {
+			mg->mg_failed_allocations++;
+			if (asize == SPA_GANGBLOCKSIZE) {
+				/*
+				 * This metaslab group was unable to allocate
+				 * the minimum gang block size so it must be
+				 * out of space. We must notify the allocation
+				 * throttle to start skipping allocation
+				 * attempts to this metaslab group until more
+				 * space becomes available.
+				 *
+				 * Note: this failure cannot be caused by the
+				 * allocation throttle since the allocation
+				 * throttle is only responsible for skipping
+				 * devices and not failing block allocations.
+				 */
+				mg->mg_no_free_space = B_TRUE;
+			}
+		}
+		mg->mg_allocations++;
+		mutex_exit(&mg->mg_lock);
+
 		if (offset != -1ULL) {
 			/*
 			 * If we've just selected this metaslab group,
@@ -1628,7 +2583,7 @@
 			 * over- or under-used relative to the pool,
 			 * and set an allocation bias to even it out.
 			 */
-			if (mc->mc_aliquot == 0) {
+			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
 				vdev_stat_t *vs = &vd->vdev_stat;
 				int64_t vu, cu;
 
@@ -1650,6 +2605,8 @@
 				 */
 				mg->mg_bias = ((cu - vu) *
 				    (int64_t)mg->mg_aliquot) / 100;
+			} else if (!metaslab_bias_enabled) {
+				mg->mg_bias = 0;
 			}
 
 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
@@ -1721,13 +2678,22 @@
 	mutex_enter(&msp->ms_lock);
 
 	if (now) {
-		space_map_remove(msp->ms_allocmap[txg & TXG_MASK],
+		range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
 		    offset, size);
-		space_map_free(msp->ms_map, offset, size);
+
+		VERIFY(!msp->ms_condensing);
+		VERIFY3U(offset, >=, msp->ms_start);
+		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
+		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
+		    msp->ms_size);
+		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+		range_tree_add(msp->ms_tree, offset, size);
 	} else {
-		if (msp->ms_freemap[txg & TXG_MASK]->sm_space == 0)
+		if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
-		space_map_add(msp->ms_freemap[txg & TXG_MASK], offset, size);
+		range_tree_add(msp->ms_freetree[txg & TXG_MASK],
+		    offset, size);
 	}
 
 	mutex_exit(&msp->ms_lock);
@@ -1762,10 +2728,10 @@
 
 	mutex_enter(&msp->ms_lock);
 
-	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map->sm_loaded)
+	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
 		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
 
-	if (error == 0 && !space_map_contains(msp->ms_map, offset, size))
+	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
 		error = SET_ERROR(ENOENT);
 
 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
@@ -1773,12 +2739,16 @@
 		return (error);
 	}
 
-	space_map_claim(msp->ms_map, offset, size);
+	VERIFY(!msp->ms_condensing);
+	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
+	range_tree_remove(msp->ms_tree, offset, size);
 
 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
-		if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
+		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
-		space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, size);
+		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
 	}
 
 	mutex_exit(&msp->ms_lock);
@@ -1786,9 +2756,57 @@
 	return (0);
 }
 
+/*
+ * Reserve some allocation slots. The reservation system must be called
+ * before we call into the allocator. If there aren't any available slots
+ * then the I/O will be throttled until an I/O completes and its slots are
+ * freed up. The function returns true if it was successful in placing
+ * the reservation.
+ */
+boolean_t
+metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
+    int flags)
+{
+	uint64_t available_slots = 0;
+	boolean_t slot_reserved = B_FALSE;
+
+	ASSERT(mc->mc_alloc_throttle_enabled);
+	mutex_enter(&mc->mc_lock);
+
+	uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
+	if (reserved_slots < mc->mc_alloc_max_slots)
+		available_slots = mc->mc_alloc_max_slots - reserved_slots;
+
+	if (slots <= available_slots || GANG_ALLOCATION(flags)) {
+		/*
+		 * We reserve the slots individually so that we can unreserve
+		 * them individually when an I/O completes.
+		 */
+		for (int d = 0; d < slots; d++) {
+			reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
+		}
+		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
+		slot_reserved = B_TRUE;
+	}
+
+	mutex_exit(&mc->mc_lock);
+	return (slot_reserved);
+}
+
+void
+metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
+{
+	ASSERT(mc->mc_alloc_throttle_enabled);
+	mutex_enter(&mc->mc_lock);
+	for (int d = 0; d < slots; d++) {
+		(void) refcount_remove(&mc->mc_alloc_slots, zio);
+	}
+	mutex_exit(&mc->mc_lock);
+}
+
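
The reservation scheme described above is easy to model outside the kernel. A minimal user-space sketch assuming only POSIX threads; all names here are invented, and it omits the per-holder refcount tracking and the gang-allocation bypass of the real code:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct throttle {
	pthread_mutex_t t_lock;
	uint64_t t_reserved;	/* slots currently handed out */
	uint64_t t_max_slots;	/* cap, cf. mc_alloc_max_slots */
} throttle_t;

static bool
throttle_reserve(throttle_t *t, int slots)
{
	bool ok = false;

	pthread_mutex_lock(&t->t_lock);
	if (t->t_reserved + slots <= t->t_max_slots) {
		t->t_reserved += slots;
		ok = true;
	}
	pthread_mutex_unlock(&t->t_lock);
	return (ok);	/* caller requeues the I/O when this fails */
}

static void
throttle_unreserve(throttle_t *t, int slots)
{
	pthread_mutex_lock(&t->t_lock);
	t->t_reserved -= slots;	/* an I/O completed; free its slots */
	pthread_mutex_unlock(&t->t_lock);
}
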
 int
 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
-    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
+    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio)
 {
 	dva_t *dva = bp->blk_dva;
 	dva_t *hintdva = hintbp->blk_dva;
@@ -1811,14 +2829,24 @@
 	for (int d = 0; d < ndvas; d++) {
 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
 		    txg, flags);
-		if (error) {
+		if (error != 0) {
 			for (d--; d >= 0; d--) {
 				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
+				metaslab_group_alloc_decrement(spa,
+				    DVA_GET_VDEV(&dva[d]), zio, flags);
 				bzero(&dva[d], sizeof (dva_t));
 			}
 			spa_config_exit(spa, SCL_ALLOC, FTAG);
 			return (error);
+		} else {
+			/*
+			 * Update the metaslab group's queue depth
+			 * based on the newly allocated dva.
+			 */
+			metaslab_group_alloc_increment(spa,
+			    DVA_GET_VDEV(&dva[d]), zio, flags);
 		}
+
 	}
 	ASSERT(error == 0);
 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
@@ -1878,19 +2906,6 @@
 	return (error);
 }
 
-static void
-checkmap(space_map_t *sm, uint64_t off, uint64_t size)
-{
-	space_seg_t *ss;
-	avl_index_t where;
-
-	mutex_enter(sm->sm_lock);
-	ss = space_map_find(sm, off, size, &where);
-	if (ss != NULL)
-		panic("freeing free block; ss=%p", (void *)ss);
-	mutex_exit(sm->sm_lock);
-}
-
 void
 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
 {
@@ -1899,19 +2914,19 @@
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
-		uint64_t vdid = DVA_GET_VDEV(&bp->blk_dva[i]);
-		vdev_t *vd = vdev_lookup_top(spa, vdid);
-		uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[i]);
+		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
+		vdev_t *vd = vdev_lookup_top(spa, vdev);
+		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
-		metaslab_t *ms = vd->vdev_ms[off >> vd->vdev_ms_shift];
+		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
-		if (ms->ms_map->sm_loaded)
-			checkmap(ms->ms_map, off, size);
+		if (msp->ms_loaded)
+			range_tree_verify(msp->ms_tree, offset, size);
 
 		for (int j = 0; j < TXG_SIZE; j++)
-			checkmap(ms->ms_freemap[j], off, size);
+			range_tree_verify(msp->ms_freetree[j], offset, size);
 		for (int j = 0; j < TXG_DEFER_SIZE; j++)
-			checkmap(ms->ms_defermap[j], off, size);
+			range_tree_verify(msp->ms_defertree[j], offset, size);
 	}
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 }

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -31,6 +31,11 @@
 
 #ifdef _KERNEL
 int reference_tracking_enable = FALSE; /* runs out of memory too easily */
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.reference_tracking_enable", &reference_tracking_enable);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN,
+    &reference_tracking_enable, 0,
+    "Track reference holders to refcount_t objects, used mostly by ZFS");
 #else
 int reference_tracking_enable = TRUE;
 #endif
@@ -70,6 +75,13 @@
 }
 
 void
+refcount_create_tracked(refcount_t *rc)
+{
+	refcount_create(rc);
+	rc->rc_tracked = B_TRUE;
+}
+
+void
 refcount_create_untracked(refcount_t *rc)
 {
 	refcount_create(rc);
@@ -228,4 +240,84 @@
 	list_destroy(&removed);
 }
 
+void
+refcount_transfer_ownership(refcount_t *rc, void *current_holder,
+    void *new_holder)
+{
+	reference_t *ref;
+	boolean_t found = B_FALSE;
+
+	mutex_enter(&rc->rc_mtx);
+	if (!rc->rc_tracked) {
+		mutex_exit(&rc->rc_mtx);
+		return;
+	}
+
+	for (ref = list_head(&rc->rc_list); ref;
+	    ref = list_next(&rc->rc_list, ref)) {
+		if (ref->ref_holder == current_holder) {
+			ref->ref_holder = new_holder;
+			found = B_TRUE;
+			break;
+		}
+	}
+	ASSERT(found);
+	mutex_exit(&rc->rc_mtx);
+}
+
+/*
+ * If tracking is enabled, return true if a reference exists that matches
+ * the "holder" tag. If tracking is disabled, then return true if a reference
+ * might be held.
+ */
+boolean_t
+refcount_held(refcount_t *rc, void *holder)
+{
+	reference_t *ref;
+
+	mutex_enter(&rc->rc_mtx);
+
+	if (!rc->rc_tracked) {
+		mutex_exit(&rc->rc_mtx);
+		return (rc->rc_count > 0);
+	}
+
+	for (ref = list_head(&rc->rc_list); ref;
+	    ref = list_next(&rc->rc_list, ref)) {
+		if (ref->ref_holder == holder) {
+			mutex_exit(&rc->rc_mtx);
+			return (B_TRUE);
+		}
+	}
+	mutex_exit(&rc->rc_mtx);
+	return (B_FALSE);
+}
+
+/*
+ * If tracking is enabled, return true if a reference does not exist that
+ * matches the "holder" tag. If tracking is disabled, always return true
+ * since the reference might not be held.
+ */
+boolean_t
+refcount_not_held(refcount_t *rc, void *holder)
+{
+	reference_t *ref;
+
+	mutex_enter(&rc->rc_mtx);
+
+	if (!rc->rc_tracked) {
+		mutex_exit(&rc->rc_mtx);
+		return (B_TRUE);
+	}
+
+	for (ref = list_head(&rc->rc_list); ref;
+	    ref = list_next(&rc->rc_list, ref)) {
+		if (ref->ref_holder == holder) {
+			mutex_exit(&rc->rc_mtx);
+			return (B_FALSE);
+		}
+	}
+	mutex_exit(&rc->rc_mtx);
+	return (B_TRUE);
+}
 #endif	/* ZFS_DEBUG */
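
A schematic of how the tracked variants above get used by debugging code such as metaslab_group_alloc_verify(); FTAG is ZFS's usual per-function holder tag, and the surrounding setup is elided (this helper is hypothetical):

/* Hypothetical debug helper; uses only the refcount API in this file. */
static void
refcount_selftest(void)
{
	refcount_t rc;

	refcount_create_tracked(&rc);
	(void) refcount_add(&rc, FTAG);		/* records this holder */
	ASSERT(refcount_held(&rc, FTAG));	/* holder tag is present */
	(void) refcount_remove(&rc, FTAG);
	ASSERT(refcount_not_held(&rc, FTAG));	/* holder tag is gone */
	refcount_destroy(&rc);
}
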

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -160,8 +160,8 @@
 	refcount_destroy(&rrl->rr_linked_rcount);
 }
 
-void
-rrw_enter_read(rrwlock_t *rrl, void *tag)
+static void
+rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag)
 {
 	mutex_enter(&rrl->rr_lock);
 #if !defined(DEBUG) && defined(_KERNEL)
@@ -177,7 +177,7 @@
 	ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
 
 	while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
-	    refcount_is_zero(&rrl->rr_anon_rcount) &&
+	    refcount_is_zero(&rrl->rr_anon_rcount) && !prio &&
 	    rrn_find(rrl) == NULL))
 		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
 
@@ -193,6 +193,25 @@
 }
 
 void
+rrw_enter_read(rrwlock_t *rrl, void *tag)
+{
+	rrw_enter_read_impl(rrl, B_FALSE, tag);
+}
+
+/*
+ * Take a read lock even if there are pending write lock requests. If we want
+ * to take a lock reentrantly, but from different threads (that have a
+ * relationship to each other), the normal detection mechanism to overrule
+ * the pending writer does not work, so we have to give an explicit hint here.
+ */
+void
+rrw_enter_read_prio(rrwlock_t *rrl, void *tag)
+{
+	rrw_enter_read_impl(rrl, B_TRUE, tag);
+}
+
+
+void
 rrw_enter_write(rrwlock_t *rrl)
 {
 	mutex_enter(&rrl->rr_lock);
@@ -287,3 +306,91 @@
 		    (void *)curthread, (void *)rn->rn_rrl);
 	}
 }
+
+/*
+ * A reader-mostly lock implementation, tuned above reader-writer locks
+ * for highly parallel read acquisitions, while pessimizing writes.
+ *
+ * The idea is to split a single busy lock into an array of locks, so that
+ * each reader can lock only one of them for read, depending on the result
+ * of a simple hash function.  That proportionally reduces lock congestion.
+ * The writer at the same time has to sequentially acquire write on all
+ * the locks.  That makes write acquisition proportionally slower, but in
+ * places where it is used (filesystem unmount) performance is not critical.
+ *
+ * All the functions below are direct wrappers around functions above.
+ */
+void
+rrm_init(rrmlock_t *rrl, boolean_t track_all)
+{
+	int i;
+
+	for (i = 0; i < RRM_NUM_LOCKS; i++)
+		rrw_init(&rrl->locks[i], track_all);
+}
+
+void
+rrm_destroy(rrmlock_t *rrl)
+{
+	int i;
+
+	for (i = 0; i < RRM_NUM_LOCKS; i++)
+		rrw_destroy(&rrl->locks[i]);
+}
+
+void
+rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
+{
+	if (rw == RW_READER)
+		rrm_enter_read(rrl, tag);
+	else
+		rrm_enter_write(rrl);
+}
+
+/*
+ * This maps the current thread to a specific lock.  Note that the lock
+ * must be released by the same thread that acquired it.  We do this
+ * mapping by taking the thread pointer mod a prime number.  We examine
+ * only the low 32 bits of the thread pointer, because 32-bit division
+ * is faster than 64-bit division, and the high 32 bits have little
+ * entropy anyway.
+ */
+#define	RRM_TD_LOCK()	(((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
+
+void
+rrm_enter_read(rrmlock_t *rrl, void *tag)
+{
+	rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
+}
+
+void
+rrm_enter_write(rrmlock_t *rrl)
+{
+	int i;
+
+	for (i = 0; i < RRM_NUM_LOCKS; i++)
+		rrw_enter_write(&rrl->locks[i]);
+}
+
+void
+rrm_exit(rrmlock_t *rrl, void *tag)
+{
+	int i;
+
+	if (rrl->locks[0].rr_writer == curthread) {
+		for (i = 0; i < RRM_NUM_LOCKS; i++)
+			rrw_exit(&rrl->locks[i], tag);
+	} else {
+		rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
+	}
+}
+
+boolean_t
+rrm_held(rrmlock_t *rrl, krw_t rw)
+{
+	if (rw == RW_WRITER) {
+		return (rrw_held(&rrl->locks[0], rw));
+	} else {
+		return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
+	}
+}
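
The hash-splitting described in the comment above can be modeled with plain POSIX rwlocks. A sketch assuming pthreads (and a scalar pthread_t); RRM_NUM_LOCKS is 17 in the matching rrwlock.h, and the re-entrancy bookkeeping of the real rrwlock_t is deliberately left out:

#include <pthread.h>
#include <stdint.h>

#define	NUM_LOCKS	17	/* prime, cf. RRM_NUM_LOCKS */

static pthread_rwlock_t locks[NUM_LOCKS];

static void
rm_init(void)
{
	for (int i = 0; i < NUM_LOCKS; i++)
		(void) pthread_rwlock_init(&locks[i], NULL);
}

static unsigned
rm_my_lock(void)
{
	/* low 32 bits of the thread identity, mod a prime */
	return ((uint32_t)(uintptr_t)pthread_self() % NUM_LOCKS);
}

static void
rm_enter_read(void)
{
	(void) pthread_rwlock_rdlock(&locks[rm_my_lock()]);
}

static void
rm_exit_read(void)
{
	(void) pthread_rwlock_unlock(&locks[rm_my_lock()]);
}

static void
rm_enter_write(void)
{
	/* writers must sweep every lock, which is why writes are slow */
	for (int i = 0; i < NUM_LOCKS; i++)
		(void) pthread_rwlock_wrlock(&locks[i]);
}

static void
rm_exit_write(void)
{
	for (int i = NUM_LOCKS - 1; i >= 0; i--)
		(void) pthread_rwlock_unlock(&locks[i]);
}
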

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -24,6 +24,8 @@
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Portions Copyright 2011 iXsystems, Inc
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
@@ -211,12 +213,6 @@
 {
 	sa_handle_t *hdl = buf;
 
-	hdl->sa_bonus_tab = NULL;
-	hdl->sa_spill_tab = NULL;
-	hdl->sa_os = NULL;
-	hdl->sa_userp = NULL;
-	hdl->sa_bonus = NULL;
-	hdl->sa_spill = NULL;
 	mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
 	return (0);
 }
@@ -501,7 +497,7 @@
 
 	if (size == 0) {
 		blocksize = SPA_MINBLOCKSIZE;
-	} else if (size > SPA_MAXBLOCKSIZE) {
+	} else if (size > SPA_OLD_MAXBLOCKSIZE) {
 		ASSERT(0);
 		return (SET_ERROR(EFBIG));
 	} else {
@@ -553,10 +549,9 @@
 {
 	int var_size = 0;
 	int i;
-	int j = -1;
 	int full_space;
 	int hdrsize;
-	boolean_t done = B_FALSE;
+	int extra_hdrsize;
 
 	if (buftype == SA_BONUS && sa->sa_force_spill) {
 		*total = 0;
@@ -567,10 +562,9 @@
 
 	*index = -1;
 	*total = 0;
+	*will_spill = B_FALSE;
 
-	if (buftype == SA_BONUS)
-		*will_spill = B_FALSE;
-
+	extra_hdrsize = 0;
 	hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
 	    sizeof (sa_hdr_phys_t);
 
@@ -582,8 +576,8 @@
 
 		*total = P2ROUNDUP(*total, 8);
 		*total += attr_desc[i].sa_length;
-		if (done)
-			goto next;
+		if (*will_spill)
+			continue;
 
 		is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
 		if (is_var_sz) {
@@ -591,21 +585,28 @@
 		}
 
 		if (is_var_sz && var_size > 1) {
-			if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
+			/*
+			 * Don't worry that the spill block might overflow.
+			 * It will be resized if needed in sa_build_layouts().
+			 */
+			if (buftype == SA_SPILL ||
+			    P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
 			    *total < full_space) {
 				/*
 				 * Account for header space used by array of
 				 * optional sizes of variable-length attributes.
-				 * Record the index in case this increase needs
-				 * to be reversed due to spill-over.
+				 * Record the extra header size in case this
+				 * increase needs to be reversed due to
+				 * spill-over.
 				 */
 				hdrsize += sizeof (uint16_t);
-				j = i;
+				if (*index != -1)
+					extra_hdrsize += sizeof (uint16_t);
 			} else {
-				done = B_TRUE;
-				*index = i;
-				if (buftype == SA_BONUS)
-					*will_spill = B_TRUE;
+				ASSERT(buftype == SA_BONUS);
+				if (*index == -1)
+					*index = i;
+				*will_spill = B_TRUE;
 				continue;
 			}
 		}
@@ -620,22 +621,15 @@
 		    (*total + P2ROUNDUP(hdrsize, 8)) >
 		    (full_space - sizeof (blkptr_t))) {
 			*index = i;
-			done = B_TRUE;
 		}
 
-next:
 		if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space &&
 		    buftype == SA_BONUS)
 			*will_spill = B_TRUE;
 	}
 
-	/*
-	 * j holds the index of the last variable-sized attribute for
-	 * which hdrsize was increased.  Reverse the increase if that
-	 * attribute will be relocated to the spill block.
-	 */
-	if (*will_spill && j == *index)
-		hdrsize -= sizeof (uint16_t);
+	if (*will_spill)
+		hdrsize -= extra_hdrsize;
 
 	hdrsize = P2ROUNDUP(hdrsize, 8);
 	return (hdrsize);
@@ -676,7 +670,7 @@
 	hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
 	    SA_BONUS, &i, &used, &spilling);
 
-	if (used > SPA_MAXBLOCKSIZE)
+	if (used > SPA_OLD_MAXBLOCKSIZE)
 		return (SET_ERROR(EFBIG));
 
 	VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
@@ -700,7 +694,7 @@
 		    attr_count - i, hdl->sa_spill, SA_SPILL, &i,
 		    &spill_used, &dummy);
 
-		if (spill_used > SPA_MAXBLOCKSIZE)
+		if (spill_used > SPA_OLD_MAXBLOCKSIZE)
 			return (SET_ERROR(EFBIG));
 
 		buf_space = hdl->sa_spill->db_size - spillhdrsize;
@@ -1112,6 +1106,9 @@
 	if (sa->sa_user_table)
 		kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
 	mutex_exit(&sa->sa_lock);
+	avl_destroy(&sa->sa_layout_hash_tree);
+	avl_destroy(&sa->sa_layout_num_tree);
+	mutex_destroy(&sa->sa_lock);
 	kmem_free(sa, sizeof (sa_os_t));
 	return ((error == ECKSUM) ? EIO : error);
 }
@@ -1147,6 +1144,7 @@
 
 	avl_destroy(&sa->sa_layout_hash_tree);
 	avl_destroy(&sa->sa_layout_num_tree);
+	mutex_destroy(&sa->sa_lock);
 
 	kmem_free(sa, sizeof (sa_os_t));
 	os->os_sa = NULL;
@@ -1301,10 +1299,10 @@
 }
 
 /*ARGSUSED*/
-void
-sa_evict(dmu_buf_t *db, void *sap)
+static void
+sa_evict(void *dbu)
 {
-	panic("evicting sa dbuf %p\n", (void *)db);
+	panic("evicting sa dbuf\n");
 }
 
 static void
@@ -1343,18 +1341,16 @@
 void
 sa_handle_destroy(sa_handle_t *hdl)
 {
+	dmu_buf_t *db = hdl->sa_bonus;
+
 	mutex_enter(&hdl->sa_lock);
-	(void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl,
-	    NULL, NULL, NULL);
+	(void) dmu_buf_remove_user(db, &hdl->sa_dbu);
 
-	if (hdl->sa_bonus_tab) {
+	if (hdl->sa_bonus_tab)
 		sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
-		hdl->sa_bonus_tab = NULL;
-	}
-	if (hdl->sa_spill_tab) {
+
+	if (hdl->sa_spill_tab)
 		sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
-		hdl->sa_spill_tab = NULL;
-	}
 
 	dmu_buf_rele(hdl->sa_bonus, NULL);
 
@@ -1371,7 +1367,7 @@
 {
 	int error = 0;
 	dmu_object_info_t doi;
-	sa_handle_t *handle;
+	sa_handle_t *handle = NULL;
 
 #ifdef ZFS_DEBUG
 	dmu_object_info_from_db(db, &doi);
@@ -1381,23 +1377,31 @@
 	/* find handle, if it exists */
 	/* if one doesn't exist then create a new one, and initialize it */
 
-	handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL;
+	if (hdl_type == SA_HDL_SHARED)
+		handle = dmu_buf_get_user(db);
+
 	if (handle == NULL) {
-		sa_handle_t *newhandle;
+		sa_handle_t *winner = NULL;
+
 		handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
+		handle->sa_dbu.dbu_evict_func = NULL;
 		handle->sa_userp = userp;
 		handle->sa_bonus = db;
 		handle->sa_os = os;
 		handle->sa_spill = NULL;
+		handle->sa_bonus_tab = NULL;
+		handle->sa_spill_tab = NULL;
 
 		error = sa_build_index(handle, SA_BONUS);
-		newhandle = (hdl_type == SA_HDL_SHARED) ?
-		    dmu_buf_set_user_ie(db, handle,
-		    NULL, sa_evict) : NULL;
 
-		if (newhandle != NULL) {
+		if (hdl_type == SA_HDL_SHARED) {
+			dmu_buf_init_user(&handle->sa_dbu, sa_evict, NULL);
+			winner = dmu_buf_set_user_ie(db, &handle->sa_dbu);
+		}
+
+		if (winner != NULL) {
 			kmem_cache_free(sa_cache, handle);
-			handle = newhandle;
+			handle = winner;
 		}
 	}
 	*handlepp = handle;
@@ -1650,7 +1654,7 @@
 	int spill_data_size = 0;
 	int spill_attr_count = 0;
 	int error;
-	uint16_t length;
+	uint16_t length, reg_length;
 	int i, j, k, length_idx;
 	sa_hdr_phys_t *hdr;
 	sa_idx_tab_t *idx_tab;
@@ -1710,34 +1714,50 @@
 	hdr = SA_GET_HDR(hdl, SA_BONUS);
 	idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
 	for (; k != 2; k++) {
-		/* iterate over each attribute in layout */
+		/*
+		 * Iterate over each attribute in layout.  Fetch the
+		 * size of variable-length attributes needing rewrite
+		 * from sa_lengths[].
+		 */
 		for (i = 0, length_idx = 0; i != count; i++) {
 			sa_attr_type_t attr;
 
 			attr = idx_tab->sa_layout->lot_attrs[i];
+			reg_length = SA_REGISTERED_LEN(sa, attr);
+			if (reg_length == 0) {
+				length = hdr->sa_lengths[length_idx];
+				length_idx++;
+			} else {
+				length = reg_length;
+			}
 			if (attr == newattr) {
-				/* duplicate attributes are not allowed */
-				ASSERT(action == SA_REPLACE ||
-				    action == SA_REMOVE);
-				/* must be variable-sized to be replaced here */
-				if (action == SA_REPLACE) {
-					ASSERT(SA_REGISTERED_LEN(sa, attr) == 0);
-					SA_ADD_BULK_ATTR(attr_desc, j, attr,
-					    locator, datastart, buflen);
-				}
-			} else {
-				length = SA_REGISTERED_LEN(sa, attr);
-				if (length == 0) {
-					length = hdr->sa_lengths[length_idx];
-				}
+				/*
+				 * There is nothing to do for SA_REMOVE,
+				 * so it is just skipped.
+				 */
+				if (action == SA_REMOVE)
+					continue;
 
+				/*
+				 * Duplicate attributes are not allowed, so the
+				 * action can not be SA_ADD here.
+				 */
+				ASSERT3S(action, ==, SA_REPLACE);
+
+				/*
+				 * Only a variable-sized attribute can be
+				 * replaced here, and its size must be changing.
+				 */
+				ASSERT3U(reg_length, ==, 0);
+				ASSERT3U(length, !=, buflen);
 				SA_ADD_BULK_ATTR(attr_desc, j, attr,
+				    locator, datastart, buflen);
+			} else {
+				SA_ADD_BULK_ATTR(attr_desc, j, attr,
 				    NULL, (void *)
 				    (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
 				    (uintptr_t)old_data[k]), length);
 			}
-			if (SA_REGISTERED_LEN(sa, attr) == 0)
-				length_idx++;
 		}
 		if (k == 0 && hdl->sa_spill) {
 			hdr = SA_GET_HDR(hdl, SA_SPILL);
@@ -1748,10 +1768,8 @@
 		}
 	}
 	if (action == SA_ADD) {
-		length = SA_REGISTERED_LEN(sa, newattr);
-		if (length == 0) {
-			length = buflen;
-		}
+		reg_length = SA_REGISTERED_LEN(sa, newattr);
+		IMPLY(reg_length != 0, reg_length == buflen);
 		SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
 		    datastart, buflen);
 	}
@@ -1915,14 +1933,6 @@
 }
 
 void
-sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl)
-{
-	(void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus,
-	    oldhdl, newhdl, NULL, sa_evict);
-	oldhdl->sa_bonus = NULL;
-}
-
-void
 sa_set_userp(sa_handle_t *hdl, void *ptr)
 {
 	hdl->sa_userp = ptr;
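
The rewritten sa_handle_get_from_db() above uses the usual lost-the-race ("winner") publication idiom: build a candidate, try to install it, and discard it if another thread won. A generic C11 rendering of the idiom, independent of the DMU API (names illustrative, error handling elided):

#include <stdatomic.h>
#include <stdlib.h>

struct handle {
	int h_data;	/* stand-in payload */
};

static _Atomic(struct handle *) shared_handle;

static struct handle *
handle_get(void)
{
	struct handle *h = atomic_load(&shared_handle);

	if (h == NULL) {
		struct handle *cand = calloc(1, sizeof (*cand));
		struct handle *expected = NULL;

		if (atomic_compare_exchange_strong(&shared_handle,
		    &expected, cand)) {
			h = cand;	/* we published first */
		} else {
			free(cand);	/* lost the race; use the winner */
			h = expected;	/* CAS stored the current value here */
		}
	}
	return (h);
}
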

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -23,16 +23,21 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
 #include <sys/zfs_context.h>
 #include <sys/zio.h>
 #ifdef _KERNEL
-#include <crypto/sha2/sha2.h>
+#include <crypto/sha2/sha256.h>
 #else
 #include <sha256.h>
 #endif
 
+/*ARGSUSED*/
 void
-zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
+zio_checksum_SHA256(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
 	SHA256_CTX ctx;
 	zio_cksum_t tmp;
@@ -53,3 +58,31 @@
 	zcp->zc_word[2] = BE_64(tmp.zc_word[2]);
 	zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
 }
+
+#ifdef illumos
+/*ARGSUSED*/
+void
+zio_checksum_SHA512_native(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	SHA2_CTX	ctx;
+
+	SHA2Init(SHA512_256, &ctx);
+	SHA2Update(&ctx, buf, size);
+	SHA2Final(zcp, &ctx);
+}
+
+/*ARGSUSED*/
+void
+zio_checksum_SHA512_byteswap(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	zio_cksum_t	tmp;
+
+	zio_checksum_SHA512_native(buf, size, ctx_template, &tmp);
+	zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+	zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+	zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+	zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
+#endif
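
zio_checksum_SHA256() above folds the raw 256-bit digest into four 64-bit words stored big-endian. An equivalent fold over the digest bytes that is independent of host byte order (standalone; the real code gets the digest from SHA256_Final()):

#include <stdint.h>

/* Interpret eight digest bytes as one big-endian 64-bit word. */
static uint64_t
load_be64(const uint8_t *p)
{
	uint64_t w = 0;

	for (int i = 0; i < 8; i++)
		w = (w << 8) | p[i];
	return (w);
}

/* Fold a 32-byte SHA-256 digest into the four checksum words. */
static void
fold_digest(const uint8_t digest[32], uint64_t words[4])
{
	for (int i = 0; i < 4; i++)
		words[i] = load_be64(digest + 8 * i);
}
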

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,9 +22,12 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /*
@@ -84,11 +87,6 @@
 /* Check hostid on import? */
 static int check_hostid = 1;
 
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
-    "Check hostid on import?");
-
 /*
  * The interval, in seconds, at which failed configuration cache file writes
  * should be retried.
@@ -95,24 +93,33 @@
  */
 static int zfs_ccw_retry_interval = 300;
 
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
+    "Check hostid on import?");
+TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
+    &zfs_ccw_retry_interval, 0,
+    "Configuration cache file write, retry after failure, interval (seconds)");
+
 typedef enum zti_modes {
-	zti_mode_fixed,			/* value is # of threads (min 1) */
-	zti_mode_online_percent,	/* value is % of online CPUs */
-	zti_mode_batch,			/* cpu-intensive; value is ignored */
-	zti_mode_null,			/* don't create a taskq */
-	zti_nmodes
+	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
+	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
+	ZTI_MODE_NULL,			/* don't create a taskq */
+	ZTI_NMODES
 } zti_modes_t;
 
-#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
-#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
-#define	ZTI_BATCH	{ zti_mode_batch, 0 }
-#define	ZTI_NULL	{ zti_mode_null, 0 }
+#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
+#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
+#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
 
-#define	ZTI_ONE		ZTI_FIX(1)
+#define	ZTI_N(n)	ZTI_P(n, 1)
+#define	ZTI_ONE		ZTI_N(1)
 
 typedef struct zio_taskq_info {
-	enum zti_modes zti_mode;
+	zti_modes_t zti_mode;
 	uint_t zti_value;
+	uint_t zti_count;
 } zio_taskq_info_t;
 
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
@@ -120,19 +127,34 @@
 };
 
 /*
- * Define the taskq threads for the following I/O types:
- * 	NULL, READ, WRITE, FREE, CLAIM, and IOCTL
+ * This table defines the taskq settings for each ZFS I/O type. When
+ * initializing a pool, we use this table to create an appropriately sized
+ * taskq. Some operations are low volume and therefore have a small, static
+ * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
+ * macros. Other operations process a large amount of data; the ZTI_BATCH
+ * macro causes us to create a taskq oriented for throughput. Some operations
+ * are so high frequency and short-lived that the taskq itself can become a
+ * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
+ * additional degree of parallelism specified by the number of threads per-
+ * taskq and the number of taskqs; when dispatching an event in this case, the
+ * particular taskq is chosen at random.
+ *
+ * The different taskq priorities are to handle the different contexts (issue
+ * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
+ * need to be handled with minimum delay.
  */
 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
-	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
-	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
-	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
-	{ ZTI_FIX(100),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
-	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
-	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
+	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL }, /* READ */
+	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
+	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
 };
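
Reading one row of the table: for READ, ISSUE is ZTI_N(8) (one taskq with eight threads) and INTR is ZTI_P(12, 8) (eight taskqs of twelve threads each). A tiny sketch of how a simplified entry expands, mirroring what spa_taskqs_init() below does with counts and thread values (structures reduced to the essentials):

#include <stdio.h>

typedef struct zti {
	int zti_value;	/* threads per taskq (ZTI_MODE_FIXED) */
	int zti_count;	/* number of taskqs */
} zti_t;

#define	ZTI_P(n, q)	{ (n), (q) }
#define	ZTI_N(n)	ZTI_P(n, 1)

int
main(void)
{
	zti_t read_issue = ZTI_N(8);
	zti_t read_intr = ZTI_P(12, 8);

	printf("READ issue: %d taskq(s) x %d thread(s)\n",
	    read_issue.zti_count, read_issue.zti_value);
	printf("READ intr : %d taskq(s) x %d thread(s)\n",
	    read_intr.zti_count, read_intr.zti_value);
	return (0);
}
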
 
+static sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, const char *name);
+static void spa_event_post(sysevent_t *ev);
 static void spa_sync_version(void *arg, dmu_tx_t *tx);
 static void spa_sync_props(void *arg, dmu_tx_t *tx);
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
@@ -141,22 +163,18 @@
     char **ereport);
 static void spa_vdev_resilver_done(spa_t *spa);
 
-uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
+uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
 #ifdef PSRSET_BIND
 id_t		zio_taskq_psrset_bind = PS_NONE;
 #endif
 #ifdef SYSDC
 boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
+uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
 #endif
-uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
 
 boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
 extern int	zfs_sync_pass_deferred_free;
 
-#ifndef illumos
-extern void spa_deadman(void *arg);
-#endif
-
 /*
  * This (illegal) pool name is used when temporarily importing a spa_t in order
  * to get the vdev stats associated with the imported devices.
@@ -199,12 +217,10 @@
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	dsl_pool_t *pool = spa->spa_dsl_pool;
-	uint64_t size;
-	uint64_t alloc;
-	uint64_t space;
-	uint64_t cap, version;
+	uint64_t size, alloc, cap, version;
 	zprop_source_t src = ZPROP_SRC_NONE;
 	spa_config_dirent_t *dp;
+	metaslab_class_t *mc = spa_normal_class(spa);
 
 	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 
@@ -217,14 +233,10 @@
 		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
 		    size - alloc, src);
 
-		space = 0;
-		for (int c = 0; c < rvd->vdev_children; c++) {
-			vdev_t *tvd = rvd->vdev_child[c];
-			space += tvd->vdev_max_asize - tvd->vdev_asize;
-		}
-		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
-		    src);
-
+		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
+		    metaslab_class_fragmentation(mc), src);
+		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
+		    metaslab_class_expandable_space(mc), src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
 		    (spa_mode(spa) == FREAD), src);
 
@@ -246,19 +258,27 @@
 	}
 
 	if (pool != NULL) {
-		dsl_dir_t *freedir = pool->dp_free_dir;
-
 		/*
 		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
 		 * when opening pools before this version freedir will be NULL.
 		 */
-		if (freedir != NULL) {
+		if (pool->dp_free_dir != NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
-			    freedir->dd_phys->dd_used_bytes, src);
+			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
+			    src);
 		} else {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
 			    NULL, 0, src);
 		}
+
+		if (pool->dp_leak_dir != NULL) {
+			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
+			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
+			    src);
+		} else {
+			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
+			    NULL, 0, src);
+		}
 	}
 
 	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
@@ -272,6 +292,14 @@
 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
 		    0, ZPROP_SRC_LOCAL);
 
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
+	} else {
+		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
+	}
+
 	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
 		if (dp->scd_path == NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
@@ -342,8 +370,7 @@
 					break;
 				}
 
-				strval = kmem_alloc(
-				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
+				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
 				    KM_SLEEP);
 				dsl_dataset_name(ds, strval);
 				dsl_dataset_rele(ds, FTAG);
@@ -356,8 +383,7 @@
 			spa_prop_add_list(*nvp, prop, strval, intval, src);
 
 			if (strval != NULL)
-				kmem_free(strval,
-				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
+				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
 
 			break;
 
@@ -486,7 +512,7 @@
 
 			if (!error) {
 				objset_t *os;
-				uint64_t compress;
+				uint64_t propval;
 
 				if (strval == NULL || strval[0] == '\0') {
 					objnum = zpool_prop_default_numeric(
@@ -497,7 +523,11 @@
 				if (error = dmu_objset_hold(strval, FTAG, &os))
 					break;
 
-				/* Must be ZPL and not gzip compressed. */
+				/*
+				 * Must be ZPL, and its property settings
+				 * must be supported by GRUB (compression
+				 * is not gzip, and large blocks are not used).
+				 */
 
 				if (dmu_objset_type(os) != DMU_OST_ZFS) {
 					error = SET_ERROR(ENOTSUP);
@@ -504,9 +534,15 @@
 				} else if ((error =
 				    dsl_prop_get_int_ds(dmu_objset_ds(os),
 				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
-				    &compress)) == 0 &&
-				    !BOOTFS_COMPRESS_VALID(compress)) {
+				    &propval)) == 0 &&
+				    !BOOTFS_COMPRESS_VALID(propval)) {
 					error = SET_ERROR(ENOTSUP);
+				} else if ((error =
+				    dsl_prop_get_int_ds(dmu_objset_ds(os),
+				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+				    &propval)) == 0 &&
+				    propval > SPA_OLD_MAXBLOCKSIZE) {
+					error = SET_ERROR(ENOTSUP);
 				} else {
 					objnum = dmu_objset_id(os);
 				}
@@ -573,7 +609,6 @@
 					error = SET_ERROR(EINVAL);
 					break;
 				}
-				check++;
 			}
 			if (strlen(strval) > ZPROP_MAX_COMMENT)
 				error = E2BIG;
@@ -672,7 +707,8 @@
 			 * feature descriptions object.
 			 */
 			error = dsl_sync_task(spa->spa_name, NULL,
-			    spa_sync_version, &ver, 6);
+			    spa_sync_version, &ver,
+			    6, ZFS_SPACE_CHECK_RESERVED);
 			if (error)
 				return (error);
 			continue;
@@ -684,7 +720,7 @@
 
 	if (need_sync) {
 		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
-		    nvp, 6));
+		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
 	}
 
 	return (0);
@@ -760,11 +796,12 @@
 	int error;
 	uint64_t guid;
 
+	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);
 	guid = spa_generate_guid(NULL);
 
 	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
-	    spa_change_guid_sync, &guid, 5);
+	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
 
 	if (error == 0) {
 		spa_config_sync(spa, B_FALSE, B_TRUE);
@@ -772,6 +809,7 @@
 	}
 
 	mutex_exit(&spa_namespace_lock);
+	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (error);
 }
@@ -790,7 +828,7 @@
 	int ret;
 
 	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
-	    sizeof (zbookmark_t));
+	    sizeof (zbookmark_phys_t));
 
 	if (ret < 0)
 		return (-1);
@@ -820,50 +858,141 @@
 	    offsetof(spa_error_entry_t, se_avl));
 }
 
-static taskq_t *
-spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
-    uint_t value)
+static void
+spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 {
-	uint_t flags = TASKQ_PREPOPULATE;
+	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+	enum zti_modes mode = ztip->zti_mode;
+	uint_t value = ztip->zti_value;
+	uint_t count = ztip->zti_count;
+	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+	char name[32];
+	uint_t flags = 0;
 	boolean_t batch = B_FALSE;
 
+	if (mode == ZTI_MODE_NULL) {
+		tqs->stqs_count = 0;
+		tqs->stqs_taskq = NULL;
+		return;
+	}
+
+	ASSERT3U(count, >, 0);
+
+	tqs->stqs_count = count;
+	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
+
 	switch (mode) {
-	case zti_mode_null:
-		return (NULL);		/* no taskq needed */
-
-	case zti_mode_fixed:
+	case ZTI_MODE_FIXED:
 		ASSERT3U(value, >=, 1);
 		value = MAX(value, 1);
 		break;
 
-	case zti_mode_batch:
+	case ZTI_MODE_BATCH:
 		batch = B_TRUE;
 		flags |= TASKQ_THREADS_CPU_PCT;
 		value = zio_taskq_batch_pct;
 		break;
 
-	case zti_mode_online_percent:
-		flags |= TASKQ_THREADS_CPU_PCT;
-		break;
-
 	default:
-		panic("unrecognized mode for %s taskq (%u:%u) in "
+		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
 		    "spa_activate()",
-		    name, mode, value);
+		    zio_type_name[t], zio_taskq_types[q], mode, value);
 		break;
 	}
 
+	for (uint_t i = 0; i < count; i++) {
+		taskq_t *tq;
+
+		if (count > 1) {
+			(void) snprintf(name, sizeof (name), "%s_%s_%u",
+			    zio_type_name[t], zio_taskq_types[q], i);
+		} else {
+			(void) snprintf(name, sizeof (name), "%s_%s",
+			    zio_type_name[t], zio_taskq_types[q]);
+		}
+
 #ifdef SYSDC
-	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
-		if (batch)
-			flags |= TASKQ_DC_BATCH;
+		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
+			if (batch)
+				flags |= TASKQ_DC_BATCH;
 
-		return (taskq_create_sysdc(name, value, 50, INT_MAX,
-		    spa->spa_proc, zio_taskq_basedc, flags));
+			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
+			    spa->spa_proc, zio_taskq_basedc, flags);
+		} else {
+#endif
+			pri_t pri = maxclsyspri;
+			/*
+			 * The write issue taskq can be extremely CPU
+			 * intensive.  Run it at slightly lower priority
+			 * than the other taskqs.
+			 * FreeBSD notes:
+			 * - numerically higher priorities are lower priorities;
+			 * - if priorities divided by four (RQ_PPQ) are equal
+			 *   then a difference between them is insignificant.
+			 */
+			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
+#ifdef illumos
+				pri--;
+#else
+				pri += 4;
+#endif
+
+			tq = taskq_create_proc(name, value, pri, 50,
+			    INT_MAX, spa->spa_proc, flags);
+#ifdef SYSDC
+		}
+#endif
+
+		tqs->stqs_taskq[i] = tq;
 	}
+}
+
+static void
+spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
+{
+	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+
+	if (tqs->stqs_taskq == NULL) {
+		ASSERT0(tqs->stqs_count);
+		return;
+	}
+
+	for (uint_t i = 0; i < tqs->stqs_count; i++) {
+		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
+		taskq_destroy(tqs->stqs_taskq[i]);
+	}
+
+	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
+	tqs->stqs_taskq = NULL;
+}
+
+/*
+ * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
+ * Note that a type may have multiple discrete taskqs to avoid lock contention
+ * on the taskq itself. In that case we choose which taskq to use at random
+ * by using the low bits of gethrtime().
+ */
+void
+spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
+{
+	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+	taskq_t *tq;
+
+	ASSERT3P(tqs->stqs_taskq, !=, NULL);
+	ASSERT3U(tqs->stqs_count, !=, 0);
+
+	if (tqs->stqs_count == 1) {
+		tq = tqs->stqs_taskq[0];
+	} else {
+#ifdef _KERNEL
+		tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
+#else
+		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
 #endif
-	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
-	    spa->spa_proc, flags));
+	}
+
+	taskq_dispatch_ent(tq, func, arg, flags, ent);
 }
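
The selection step in miniature: when a (type, queue) pair owns several taskqs, a cheap clock sample picks one, spreading dispatches so no single taskq lock becomes hot. A user-space sketch with a hypothetical stand-in for gethrtime():

#include <stdint.h>
#include <time.h>

typedef struct taskq {
	int tq_id;
} taskq_t;

static uint64_t
now_ns(void)	/* stand-in for gethrtime() */
{
	struct timespec ts;

	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec);
}

static taskq_t *
pick_taskq(taskq_t **tqs, unsigned count)
{
	/* the low bits of the clock are effectively random here */
	return (count == 1 ? tqs[0] : tqs[now_ns() % count]);
}
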
 
 static void
@@ -871,16 +1000,7 @@
 {
 	for (int t = 0; t < ZIO_TYPES; t++) {
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
-			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
-			enum zti_modes mode = ztip->zti_mode;
-			uint_t value = ztip->zti_value;
-			char name[32];
-
-			(void) snprintf(name, sizeof (name),
-			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);
-
-			spa->spa_zio_taskq[t][q] =
-			    spa_taskq_create(spa, name, mode, value);
+			spa_taskqs_init(spa, t, q);
 		}
 	}
 }
@@ -1018,6 +1138,8 @@
 
 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_config_dirty_node));
+	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
+	    offsetof(objset_t, os_evicting_node));
 	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_state_dirty_node));
 
@@ -1050,16 +1172,17 @@
 	 */
 	trim_thread_destroy(spa);
 
+	spa_evicting_os_wait(spa);
+
 	txg_list_destroy(&spa->spa_vdev_txg_list);
 
 	list_destroy(&spa->spa_config_dirty_list);
+	list_destroy(&spa->spa_evicting_os_list);
 	list_destroy(&spa->spa_state_dirty_list);
 
 	for (int t = 0; t < ZIO_TYPES; t++) {
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
-			if (spa->spa_zio_taskq[t][q] != NULL)
-				taskq_destroy(spa->spa_zio_taskq[t][q]);
-			spa->spa_zio_taskq[t][q] = NULL;
+			spa_taskqs_fini(spa, t, q);
 		}
 	}
 
@@ -1187,13 +1310,24 @@
 	 * Wait for any outstanding async I/O to complete.
 	 */
 	if (spa->spa_async_zio_root != NULL) {
-		(void) zio_wait(spa->spa_async_zio_root);
+		for (int i = 0; i < max_ncpus; i++)
+			(void) zio_wait(spa->spa_async_zio_root[i]);
+		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
 		spa->spa_async_zio_root = NULL;
 	}
 
 	bpobj_close(&spa->spa_deferred_bpobj);
 
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
 	/*
+	 * Close all vdevs.
+	 */
+	if (spa->spa_root_vdev)
+		vdev_free(spa->spa_root_vdev);
+	ASSERT(spa->spa_root_vdev == NULL);
+
+	/*
 	 * Close the dsl pool.
 	 */
 	if (spa->spa_dsl_pool) {
@@ -1204,20 +1338,11 @@
 
 	ddt_unload(spa);
 
-	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-
 	/*
 	 * Drop and purge level 2 cache
 	 */
 	spa_l2cache_drop(spa);
 
-	/*
-	 * Close all vdevs.
-	 */
-	if (spa->spa_root_vdev)
-		vdev_free(spa->spa_root_vdev);
-	ASSERT(spa->spa_root_vdev == NULL);
-
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		vdev_free(spa->spa_spares.sav_vdevs[i]);
 	if (spa->spa_spares.sav_vdevs) {
@@ -1511,7 +1636,10 @@
 	int error;
 	*value = NULL;
 
-	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
+	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
+	if (error != 0)
+		return (error);
+
 	nvsize = *(uint64_t *)db->db_data;
 	dmu_buf_rele(db, FTAG);
 
@@ -1671,13 +1799,14 @@
 spa_check_logs(spa_t *spa)
 {
 	boolean_t rv = B_FALSE;
+	dsl_pool_t *dp = spa_get_dsl(spa);
 
 	switch (spa->spa_log_state) {
 	case SPA_LOG_MISSING:
 		/* need to recheck in case slog has been restored */
 	case SPA_LOG_UNKNOWN:
-		rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
-		    NULL, DS_FIND_CHILDREN) != 0);
+		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
 		if (rv)
 			spa_set_log_state(spa, SPA_LOG_MISSING);
 		break;
@@ -1778,35 +1907,88 @@
 	spa_load_error_t *sle = zio->io_private;
 	dmu_object_type_t type = BP_GET_TYPE(bp);
 	int error = zio->io_error;
+	spa_t *spa = zio->io_spa;
 
 	if (error) {
 		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
 		    type != DMU_OT_INTENT_LOG)
-			atomic_add_64(&sle->sle_meta_count, 1);
+			atomic_inc_64(&sle->sle_meta_count);
 		else
-			atomic_add_64(&sle->sle_data_count, 1);
+			atomic_inc_64(&sle->sle_data_count);
 	}
 	zio_data_buf_free(zio->io_data, zio->io_size);
+
+	mutex_enter(&spa->spa_scrub_lock);
+	spa->spa_scrub_inflight--;
+	cv_broadcast(&spa->spa_scrub_io_cv);
+	mutex_exit(&spa->spa_scrub_lock);
 }
 
+/*
+ * Maximum number of concurrent scrub i/os to create while verifying
+ * a pool while importing it.
+ */
+int spa_load_verify_maxinflight = 10000;
+boolean_t spa_load_verify_metadata = B_TRUE;
+boolean_t spa_load_verify_data = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
+    &spa_load_verify_maxinflight, 0,
+    "Maximum number of concurrent scrub I/Os to create while verifying a "
+    "pool while importing it");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
+    &spa_load_verify_metadata, 0,
+    "Check metadata on import?");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
+    &spa_load_verify_data, 0,
+    "Check user data on import?");
+
 /*ARGSUSED*/
 static int
 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
-	if (bp != NULL) {
-		zio_t *rio = arg;
-		size_t size = BP_GET_PSIZE(bp);
-		void *data = zio_data_buf_alloc(size);
+	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+		return (0);
+	/*
+	 * Note: normally this routine will not be called if
+	 * spa_load_verify_metadata is not set.  However, it may be useful
+	 * to manually set the flag after the traversal has begun.
+	 */
+	if (!spa_load_verify_metadata)
+		return (0);
+	if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
+		return (0);
 
-		zio_nowait(zio_read(rio, spa, bp, data, size,
-		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
-		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
-		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
-	}
+	zio_t *rio = arg;
+	size_t size = BP_GET_PSIZE(bp);
+	void *data = zio_data_buf_alloc(size);
+
+	mutex_enter(&spa->spa_scrub_lock);
+	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
+		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+	spa->spa_scrub_inflight++;
+	mutex_exit(&spa->spa_scrub_lock);
+
+	zio_nowait(zio_read(rio, spa, bp, data, size,
+	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
+	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
 	return (0);
 }
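
The issue-side throttle above is a standard bounded-inflight pattern: issuers sleep on a condition variable while the counter is at the cap, and every completion decrements and broadcasts. Modeled with POSIX primitives (names invented; cf. spa_load_verify_cb() and spa_load_verify_done()):

#include <pthread.h>

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int inflight;
static const int maxinflight = 10000;	/* cf. spa_load_verify_maxinflight */

static void
io_issue_throttled(void)
{
	pthread_mutex_lock(&lk);
	while (inflight >= maxinflight)		/* at the cap: wait */
		pthread_cond_wait(&cv, &lk);
	inflight++;
	pthread_mutex_unlock(&lk);
	/* ... start the asynchronous read here ... */
}

static void
io_done(void)
{
	pthread_mutex_lock(&lk);
	inflight--;				/* free a slot */
	pthread_cond_broadcast(&cv);		/* wake blocked issuers */
	pthread_mutex_unlock(&lk);
}
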
 
+/* ARGSUSED */
+int
+verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+	if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
+		return (SET_ERROR(ENAMETOOLONG));
+
+	return (0);
+}
+
 static int
 spa_load_verify(spa_t *spa)
 {
@@ -1814,7 +1996,7 @@
 	spa_load_error_t sle = { 0 };
 	zpool_rewind_policy_t policy;
 	boolean_t verify_ok = B_FALSE;
-	int error;
+	int error = 0;
 
 	zpool_get_rewind_policy(spa->spa_config, &policy);
 
@@ -1821,11 +2003,22 @@
 	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
 		return (0);
 
+	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
+	error = dmu_objset_find_dp(spa->spa_dsl_pool,
+	    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
+	    DS_FIND_CHILDREN);
+	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
+	if (error != 0)
+		return (error);
+
 	rio = zio_root(spa, NULL, &sle,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
 
-	error = traverse_pool(spa, spa->spa_verify_min_txg,
-	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
+	if (spa_load_verify_metadata) {
+		error = traverse_pool(spa, spa->spa_verify_min_txg,
+		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
+		    spa_load_verify_cb, rio);
+	}
 
 	(void) zio_wait(rio);
 
@@ -2018,6 +2211,11 @@
 		    mosconfig, &ereport);
 	}
 
+	/*
+	 * Don't count references from objsets that are already closed
+	 * and are making their way through the eviction process.
+	 */
+	spa_evicting_os_wait(spa);
 	spa->spa_minref = refcount_count(&spa->spa_refcount);
 	if (error) {
 		if (error != EEXIST) {
@@ -2074,8 +2272,13 @@
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
-	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
-	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
+	    KM_SLEEP);
+	for (int i = 0; i < max_ncpus; i++) {
+		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+		    ZIO_FLAG_GODFATHER);
+	}
 
 	/*
 	 * Parse the configuration into a vdev tree.  We explicitly set the
@@ -2090,6 +2293,8 @@
 		return (error);
 
 	ASSERT(spa->spa_root_vdev == rvd);
+	ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
+	ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
 
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_guid(spa) == pool_guid);
@@ -2267,14 +2472,12 @@
 		enabled_feat = fnvlist_alloc();
 		unsup_feat = fnvlist_alloc();
 
-		if (!feature_is_supported(spa->spa_meta_objset,
-		    spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
+		if (!spa_features_check(spa, B_FALSE,
 		    unsup_feat, enabled_feat))
 			missing_feat_read = B_TRUE;
 
 		if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
-			if (!feature_is_supported(spa->spa_meta_objset,
-			    spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
+			if (!spa_features_check(spa, B_TRUE,
 			    unsup_feat, enabled_feat)) {
 				missing_feat_write = B_TRUE;
 			}
@@ -2320,8 +2523,34 @@
 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 			    ENOTSUP));
 		}
+
+		/*
+		 * Load refcounts for ZFS features from disk into an in-memory
+		 * cache during SPA initialization.
+		 */
+		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+			uint64_t refcount;
+
+			error = feature_get_refcount_from_disk(spa,
+			    &spa_feature_table[i], &refcount);
+			if (error == 0) {
+				spa->spa_feat_refcount_cache[i] = refcount;
+			} else if (error == ENOTSUP) {
+				spa->spa_feat_refcount_cache[i] =
+				    SPA_FEATURE_DISABLED;
+			} else {
+				return (spa_vdev_err(rvd,
+				    VDEV_AUX_CORRUPT_DATA, EIO));
+			}
+		}
 	}
 
+	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
+		if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
+		    &spa->spa_feat_enabled_txg_obj) != 0)
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+	}
+
 	spa->spa_is_initializing = B_TRUE;
 	error = dsl_pool_open(spa->spa_dsl_pool);
 	spa->spa_is_initializing = B_FALSE;
@@ -2377,6 +2606,19 @@
 		return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
 	}
 
+	/* Grab the secret checksum salt from the MOS. */
+	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_CHECKSUM_SALT, 1,
+	    sizeof (spa->spa_cksum_salt.zcs_bytes),
+	    spa->spa_cksum_salt.zcs_bytes);
+	if (error == ENOENT) {
+		/* Generate a new salt for subsequent use */
+		(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+		    sizeof (spa->spa_cksum_salt.zcs_bytes));
+	} else if (error != 0) {
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+	}
+
 	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
@@ -2552,7 +2794,7 @@
 		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
 			return (SET_ERROR(ENXIO));
 
-		if (spa_check_logs(spa)) {
+		if (spa_writeable(spa) && spa_check_logs(spa)) {
 			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
 			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
 		}
@@ -2583,6 +2825,7 @@
 	    spa->spa_load_max_txg == UINT64_MAX)) {
 		dmu_tx_t *tx;
 		int need_update = B_FALSE;
+		dsl_pool_t *dp = spa_get_dsl(spa);
 
 		ASSERT(state != SPA_LOAD_TRYIMPORT);
 
@@ -2595,9 +2838,8 @@
 		 */
 		spa->spa_claiming = B_TRUE;
 
-		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
-		    spa_first_txg(spa));
-		(void) dmu_objset_find(spa_name(spa),
+		tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
+		(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 		    zil_claim, tx, DS_FIND_CHILDREN);
 		dmu_tx_commit(tx);
 
@@ -2676,7 +2918,7 @@
 	spa_unload(spa);
 	spa_deactivate(spa);
 
-	spa->spa_load_max_txg--;
+	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
 
 	spa_activate(spa, mode);
 	spa_async_suspend(spa);
@@ -2706,6 +2948,8 @@
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		spa->spa_load_max_txg = max_request;
+		if (max_request != UINT64_MAX)
+			spa->spa_extreme_rewind = B_TRUE;
 	}
 
 	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
@@ -3061,6 +3305,10 @@
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 	VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
+	/* We may be unable to read features if pool is suspended. */
+	if (spa_suspended(spa))
+		goto out;
+
 	if (spa->spa_feat_for_read_obj != 0) {
 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
 		    spa->spa_feat_for_read_obj);
@@ -3087,6 +3335,7 @@
 		zap_cursor_fini(&zc);
 	}
 
+out:
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
 	    features) == 0);
 	nvlist_free(features);
@@ -3351,12 +3600,18 @@
 	uint_t nspares, nl2cache;
 	uint64_t version, obj;
 	boolean_t has_features;
+	char *poolname;
+	nvlist_t *nvl;
 
+	if (nvlist_lookup_string(props,
+	    zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
+		poolname = (char *)pool;
+
 	/*
 	 * If this pool already exists, return failure.
 	 */
 	mutex_enter(&spa_namespace_lock);
-	if (spa_lookup(pool) != NULL) {
+	if (spa_lookup(poolname) != NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(EEXIST));
 	}
@@ -3364,9 +3619,12 @@
 	/*
 	 * Allocate a new spa_t structure.
 	 */
+	nvl = fnvlist_alloc();
+	fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
-	spa = spa_add(pool, NULL, altroot);
+	spa = spa_add(poolname, nvl, altroot);
+	fnvlist_free(nvl);
 	spa_activate(spa, spa_mode_global);
 
 	if (props && (error = spa_prop_validate(spa, props))) {
@@ -3376,6 +3634,12 @@
 		return (error);
 	}
 
+	/*
+	 * Temporary pool names should never be written to disk.
+	 */
+	if (poolname != pool)
+		spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
+
 	has_features = B_FALSE;
 	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
 	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
@@ -3393,12 +3657,18 @@
 	spa->spa_uberblock.ub_txg = txg - 1;
 	spa->spa_uberblock.ub_version = version;
 	spa->spa_ubsync = spa->spa_uberblock;
+	spa->spa_load_state = SPA_LOAD_CREATE;
 
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
-	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
-	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
+	    KM_SLEEP);
+	for (int i = 0; i < max_ncpus; i++) {
+		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+		    ZIO_FLAG_GODFATHER);
+	}
 
 	/*
 	 * Create the root vdev.
@@ -3418,6 +3688,7 @@
 	    (error = spa_validate_aux(spa, nvroot, txg,
 	    VDEV_ALLOC_ADD)) == 0) {
 		for (int c = 0; c < rvd->vdev_children; c++) {
+			vdev_ashift_optimize(rvd->vdev_child[c]);
 			vdev_metaslab_set_size(rvd->vdev_child[c]);
 			vdev_expand(rvd->vdev_child[c], txg);
 		}
@@ -3532,6 +3803,12 @@
 		spa_history_create_obj(spa, tx);
 
 	/*
+	 * Generate some random noise for salted checksums to operate on.
+	 */
+	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+	    sizeof (spa->spa_cksum_salt.zcs_bytes));
+
+	/*
 	 * Set pool properties.
 	 */
 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
@@ -3556,10 +3833,17 @@
 	txg_wait_synced(spa->spa_dsl_pool, txg);
 
 	spa_config_sync(spa, B_FALSE, B_TRUE);
+	spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE);
 
 	spa_history_log_version(spa, "create");
 
+	/*
+	 * Don't count references from objsets that are already closed
+	 * and are making their way through the eviction process.
+	 */
+	spa_evicting_os_wait(spa);
 	spa->spa_minref = refcount_count(&spa->spa_refcount);
+	spa->spa_load_state = SPA_LOAD_NONE;
 
 	mutex_exit(&spa_namespace_lock);
 
@@ -3567,7 +3851,7 @@
 }
 
 #ifdef _KERNEL
-#if defined(sun)
+#ifdef illumos
 /*
  * Get the root pool information from the root disk, then import the root pool
  * during the system boot up time.
@@ -3768,7 +4052,7 @@
 	return (error);
 }
 
-#else
+#else	/* !illumos */
 
 extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
     uint64_t *count);
@@ -3911,6 +4195,16 @@
 
 		if ((spa = spa_lookup(pname)) != NULL) {
 			/*
+			 * The pool could already be imported,
+			 * e.g., after reboot -r.
+			 */
+			if (spa->spa_state == POOL_STATE_ACTIVE) {
+				mutex_exit(&spa_namespace_lock);
+				nvlist_free(config);
+				return (0);
+			}
+
+			/*
 			 * Remove the existing root pool from the namespace so
 			 * that we can replace it with the correct config
 			 * we just read in.
@@ -3927,6 +4221,8 @@
 		    &spa->spa_ubsync.ub_version) != 0)
 			spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
 	} else if ((spa = spa_lookup(name)) == NULL) {
+		mutex_exit(&spa_namespace_lock);
+		nvlist_free(config);
 		cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
 		    name);
 		return (EIO);
@@ -3962,8 +4258,8 @@
 	return (0);
 }
 
-#endif	/* sun */
-#endif
+#endif	/* illumos */
+#endif	/* _KERNEL */
 
 /*
  * Import a non-root pool into the system.
@@ -4012,10 +4308,9 @@
 			spa_configfile_set(spa, props, B_FALSE);
 
 		spa_config_sync(spa, B_FALSE, B_TRUE);
+		spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT);
 
 		mutex_exit(&spa_namespace_lock);
-		spa_history_log_version(spa, "import");
-
 		return (0);
 	}
 
@@ -4144,9 +4439,12 @@
 	 */
 	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
 
-	mutex_exit(&spa_namespace_lock);
 	spa_history_log_version(spa, "import");
 
+	spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT);
+
+	mutex_exit(&spa_namespace_lock);
+
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 	zvol_create_minors(pool);
@@ -4294,6 +4592,7 @@
 		 * have to force it to sync before checking spa_refcnt.
 		 */
 		txg_wait_synced(spa->spa_dsl_pool, 0);
+		spa_evicting_os_wait(spa);
 
 		/*
 		 * A pool cannot be exported or destroyed if there are active
@@ -4491,6 +4790,7 @@
 
 	mutex_enter(&spa_namespace_lock);
 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+	spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD);
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
@@ -4627,7 +4927,7 @@
 	}
 
 	/* mark the device being resilvered */
-	newvd->vdev_resilvering = B_TRUE;
+	newvd->vdev_resilver_txg = txg;
 
 	/*
 	 * If the parent is not a mirror, or if we're replacing, insert the new
@@ -4679,10 +4979,17 @@
 	vdev_dirty(tvd, VDD_DTL, newvd, txg);
 
 	/*
-	 * Restart the resilver
+	 * Schedule the resilver to restart in the future. We do this to
+	 * ensure that dmu_sync-ed blocks have been stitched into the
+	 * respective datasets.
 	 */
 	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
 
+	if (spa->spa_bootfs)
+		spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
+
+	spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH);
+
 	/*
 	 * Commit the config
 	 */
@@ -4697,9 +5004,6 @@
 	spa_strfree(oldvdpath);
 	spa_strfree(newvdpath);
 
-	if (spa->spa_bootfs)
-		spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
-
 	return (0);
 }
 
@@ -4857,7 +5161,6 @@
 		if (pvd->vdev_ops == &vdev_spare_ops)
 			cvd->vdev_unspare = B_FALSE;
 		vdev_remove_parent(cvd);
-		cvd->vdev_resilvering = B_FALSE;
 	}
 
 
@@ -5132,13 +5435,13 @@
 	spa_activate(newspa, spa_mode_global);
 	spa_async_suspend(newspa);
 
-#ifndef sun
+#ifndef illumos
 	/* mark that we are creating new spa by splitting */
 	newspa->spa_splitting_newspa = B_TRUE;
 #endif
 	/* create the new pool from the disks of the original pool */
 	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
-#ifndef sun
+#ifndef illumos
 	newspa->spa_splitting_newspa = B_FALSE;
 #endif
 	if (error)
@@ -5250,7 +5553,7 @@
 
 static void
 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
-	nvlist_t *dev_to_remove)
+    nvlist_t *dev_to_remove)
 {
 	nvlist_t **newdev = NULL;
 
@@ -5309,7 +5612,7 @@
 	ASSERT0(vd->vdev_stat.vs_alloc);
 	txg = spa_vdev_config_enter(spa);
 	vd->vdev_removing = B_TRUE;
-	vdev_dirty(vd, 0, NULL, txg);
+	vdev_dirty_leaves(vd, VDD_DTL, txg);
 	vdev_config_dirty(vd);
 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 
@@ -5375,6 +5678,7 @@
 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 {
 	vdev_t *vd;
+	sysevent_t *ev = NULL;
 	metaslab_group_t *mg;
 	nvlist_t **spares, **l2cache, *nv;
 	uint64_t txg = 0;
@@ -5398,6 +5702,9 @@
 		 * in this pool.
 		 */
 		if (vd == NULL || unspare) {
+			if (vd == NULL)
+				vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+			ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX);
 			spa_vdev_remove_aux(spa->spa_spares.sav_config,
 			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
 			spa_load_spares(spa);
@@ -5412,6 +5719,8 @@
 		/*
 		 * Cache devices can always be removed.
 		 */
+		vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+		ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX);
 		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
 		spa_load_l2cache(spa);
@@ -5420,11 +5729,6 @@
 		ASSERT(!locked);
 		ASSERT(vd == vd->vdev_top);
 
-		/*
-		 * XXX - Once we have bp-rewrite this should
-		 * become the common case.
-		 */
-
 		mg = vd->vdev_mg;
 
 		/*
@@ -5457,6 +5761,7 @@
 		/*
 		 * Clean up the vdev namespace.
 		 */
+		ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_DEV);
 		spa_vdev_remove_from_namespace(spa, vd);
 
 	} else if (vd != NULL) {
@@ -5472,8 +5777,11 @@
 	}
 
 	if (!locked)
-		return (spa_vdev_exit(spa, NULL, txg, error));
+		error = spa_vdev_exit(spa, NULL, txg, error);
 
+	if (ev)
+		spa_event_post(ev);
+
 	return (error);
 }
 
@@ -5581,6 +5889,8 @@
 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
 			sguid = ppvd->vdev_child[1]->vdev_guid;
 		}
+		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
+
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
 			return;
@@ -5705,6 +6015,8 @@
 		vd->vdev_stat.vs_checksum_errors = 0;
 
 		vdev_state_dirty(vd->vdev_top);
+		/* Tell userspace that the vdev is gone. */
+		zfs_post_remove(spa, vd);
 	}
 
 	for (int c = 0; c < vd->vdev_children; c++)
@@ -5764,7 +6076,7 @@
 
 	mutex_enter(&spa->spa_async_lock);
 	tasks = spa->spa_async_tasks;
-	spa->spa_async_tasks = 0;
+	spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
 	mutex_exit(&spa->spa_async_lock);
 
 	/*
@@ -5790,19 +6102,6 @@
 		}
 	}
 
-	/*
-	 * See if any devices need to be marked REMOVED.
-	 */
-	if (tasks & SPA_ASYNC_REMOVE) {
-		spa_vdev_state_enter(spa, SCL_NONE);
-		spa_async_remove(spa, spa->spa_root_vdev);
-		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
-			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
-		for (int i = 0; i < spa->spa_spares.sav_count; i++)
-			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
-		(void) spa_vdev_state_exit(spa, NULL, 0);
-	}
-
 	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_async_autoexpand(spa, spa->spa_root_vdev);
@@ -5840,12 +6139,51 @@
 	thread_exit();
 }
 
+static void
+spa_async_thread_vd(void *arg)
+{
+	spa_t *spa = arg;
+	int tasks;
+
+	mutex_enter(&spa->spa_async_lock);
+	tasks = spa->spa_async_tasks;
+retry:
+	spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
+	mutex_exit(&spa->spa_async_lock);
+
+	/*
+	 * See if any devices need to be marked REMOVED.
+	 */
+	if (tasks & SPA_ASYNC_REMOVE) {
+		spa_vdev_state_enter(spa, SCL_NONE);
+		spa_async_remove(spa, spa->spa_root_vdev);
+		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
+			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
+		for (int i = 0; i < spa->spa_spares.sav_count; i++)
+			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
+		(void) spa_vdev_state_exit(spa, NULL, 0);
+	}
+
+	/*
+	 * Let the world know that we're done.
+	 */
+	mutex_enter(&spa->spa_async_lock);
+	tasks = spa->spa_async_tasks;
+	if ((tasks & SPA_ASYNC_REMOVE) != 0)
+		goto retry;
+	spa->spa_async_thread_vd = NULL;
+	cv_broadcast(&spa->spa_async_cv);
+	mutex_exit(&spa->spa_async_lock);
+	thread_exit();
+}
+
 void
 spa_async_suspend(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_suspended++;
-	while (spa->spa_async_thread != NULL)
+	while (spa->spa_async_thread != NULL &&
+	    spa->spa_async_thread_vd != NULL)
 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 	mutex_exit(&spa->spa_async_lock);
 }
@@ -5866,7 +6204,8 @@
 	uint_t config_task;
 	boolean_t config_task_suspended;
 
-	non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
+	non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
+	    SPA_ASYNC_REMOVE);
 	config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
 	if (spa->spa_ccw_fail_time == 0) {
 		config_task_suspended = B_FALSE;
@@ -5892,6 +6231,19 @@
 	mutex_exit(&spa->spa_async_lock);
 }
 
+static void
+spa_async_dispatch_vd(spa_t *spa)
+{
+	mutex_enter(&spa->spa_async_lock);
+	if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
+	    !spa->spa_async_suspended &&
+	    spa->spa_async_thread_vd == NULL &&
+	    rootdir != NULL)
+		spa->spa_async_thread_vd = thread_create(NULL, 0,
+		    spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
+	mutex_exit(&spa->spa_async_lock);
+}
+
 void
 spa_async_request(spa_t *spa, int task)
 {
@@ -5899,6 +6251,7 @@
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_tasks |= task;
 	mutex_exit(&spa->spa_async_lock);
+	spa_async_dispatch_vd(spa);
 }
 
 /*
@@ -5925,7 +6278,33 @@
 	return (0);
 }
 
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing frees.
+ */
 static void
+spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
+{
+	zio_t *zio = zio_root(spa, NULL, NULL, 0);
+	bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
+	VERIFY(zio_wait(zio) == 0);
+}
+
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing deferred frees.
+ */
+static void
+spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
+{
+	zio_t *zio = zio_root(spa, NULL, NULL, 0);
+	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
+	    spa_free_sync_cb, zio, tx), ==, 0);
+	VERIFY0(zio_wait(zio));
+}
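
Because spa_sync_frees() and spa_sync_deferred_frees() stay out-of-line,
a DTrace fbt entry/return probe pair on either symbol (e.g. on
spa_sync_frees) is enough to time the frees for each txg, which is
exactly what the two comments above are getting at.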
+
+static void
 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
 {
 	char *packed = NULL;
@@ -5937,7 +6316,7 @@
 
 	/*
 	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
-	 * information.  This avoids the dbuf_will_dirty() path and
+	 * information.  This avoids the dmu_buf_will_dirty() path and
 	 * saves us a pre-read to get data we don't actually care about.
 	 */
 	bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
@@ -6026,8 +6405,7 @@
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
-	if (spa->spa_config_syncing)
-		nvlist_free(spa->spa_config_syncing);
+	nvlist_free(spa->spa_config_syncing);
 	spa->spa_config_syncing = config;
 
 	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
@@ -6072,7 +6450,7 @@
 		zpool_prop_t prop;
 		const char *propname;
 		zprop_type_t proptype;
-		zfeature_info_t *feature;
+		spa_feature_t fid;
 
 		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
 		case ZPROP_INVAL:
@@ -6082,15 +6460,15 @@
 			ASSERT(zpool_prop_feature(nvpair_name(elem)));
 
 			fname = strchr(nvpair_name(elem), '@') + 1;
-			VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
+			VERIFY0(zfeature_lookup_name(fname, &fid));
 
-			spa_feature_enable(spa, feature, tx);
+			spa_feature_enable(spa, fid, tx);
 			spa_history_log_internal(spa, "set", tx,
 			    "%s=enabled", nvpair_name(elem));
 			break;
 
 		case ZPOOL_PROP_VERSION:
-			VERIFY(nvpair_value_uint64(elem, &intval) == 0);
+			intval = fnvpair_value_uint64(elem);
 			/*
 			 * The version is synced separately before other
 			 * properties and should be correct by now.
@@ -6114,7 +6492,7 @@
 			 */
 			break;
 		case ZPOOL_PROP_COMMENT:
-			VERIFY(nvpair_value_string(elem, &strval) == 0);
+			strval = fnvpair_value_string(elem);
 			if (spa->spa_comment != NULL)
 				spa_strfree(spa->spa_comment);
 			spa->spa_comment = spa_strdup(strval);
@@ -6146,23 +6524,23 @@
 
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				ASSERT(proptype == PROP_TYPE_STRING);
-				VERIFY(nvpair_value_string(elem, &strval) == 0);
-				VERIFY(zap_update(mos,
+				strval = fnvpair_value_string(elem);
+				VERIFY0(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
-				    1, strlen(strval) + 1, strval, tx) == 0);
+				    1, strlen(strval) + 1, strval, tx));
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=%s", nvpair_name(elem), strval);
 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
-				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
+				intval = fnvpair_value_uint64(elem);
 
 				if (proptype == PROP_TYPE_INDEX) {
 					const char *unused;
-					VERIFY(zpool_prop_index_to_string(
-					    prop, intval, &unused) == 0);
+					VERIFY0(zpool_prop_index_to_string(
+					    prop, intval, &unused));
 				}
-				VERIFY(zap_update(mos,
+				VERIFY0(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
-				    8, 1, &intval, tx) == 0);
+				    8, 1, &intval, tx));
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=%lld", nvpair_name(elem), intval);
 			} else {
@@ -6239,6 +6617,36 @@
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
 		spa_feature_create_zap_objects(spa, tx);
 	}
+
+	/*
+	 * The LZ4_COMPRESS feature's behaviour was changed to
+	 * activate_on_enable when the ability to use lz4 compression for
+	 * metadata was added.  Old pools that have this feature enabled
+	 * must be upgraded to have this feature active.
+	 */
+	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+		boolean_t lz4_en = spa_feature_is_enabled(spa,
+		    SPA_FEATURE_LZ4_COMPRESS);
+		boolean_t lz4_ac = spa_feature_is_active(spa,
+		    SPA_FEATURE_LZ4_COMPRESS);
+
+		if (lz4_en && !lz4_ac)
+			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
+	}
+
+	/*
+	 * If we haven't written the salt, do so now.  Note that the
+	 * feature may not be activated yet, but that's fine since
+	 * the presence of this ZAP entry is backwards compatible.
+	 */
+	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
+		VERIFY0(zap_add(spa->spa_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
+		    sizeof (spa->spa_cksum_salt.zcs_bytes),
+		    spa->spa_cksum_salt.zcs_bytes, tx));
+	}
+
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 }
 
@@ -6251,12 +6659,13 @@
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	objset_t *mos = spa->spa_meta_objset;
-	bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
 	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd;
 	dmu_tx_t *tx;
 	int error;
+	uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
+	    zfs_vdev_queue_depth_pct / 100;
 
 	VERIFY(spa_writeable(spa));
 
@@ -6268,6 +6677,10 @@
 	spa->spa_syncing_txg = txg;
 	spa->spa_sync_pass = 0;
 
+	mutex_enter(&spa->spa_alloc_lock);
+	VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
+	mutex_exit(&spa->spa_alloc_lock);
+
 	/*
 	 * If there are any pending vdev state changes, convert them
 	 * into config changes that go out with this transaction group.
@@ -6298,12 +6711,12 @@
 #ifdef illumos
 	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
 	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
-#else	/* FreeBSD */
+#else	/* !illumos */
 #ifdef _KERNEL
-	callout_reset(&spa->spa_deadman_cycid,
-	    hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa);
+	callout_schedule(&spa->spa_deadman_cycid,
+	    hz * spa->spa_deadman_synctime / NANOSEC);
 #endif
-#endif
+#endif	/* illumos */
 
 	/*
 	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
@@ -6327,23 +6740,37 @@
 	}
 
 	/*
-	 * If anything has changed in this txg, or if someone is waiting
-	 * for this txg to sync (eg, spa_vdev_remove()), push the
-	 * deferred frees from the previous txg.  If not, leave them
-	 * alone so that we don't generate work on an otherwise idle
-	 * system.
+	 * Set the top-level vdev's max queue depth. Evaluate each
+	 * top-level's async write queue depth in case it changed.
+	 * The max queue depth will not change in the middle of syncing
+	 * out this txg.
 	 */
-	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
-	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
-	    !txg_list_empty(&dp->dp_sync_tasks, txg) ||
-	    ((dsl_scan_active(dp->dp_scan) ||
-	    txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
-		zio_t *zio = zio_root(spa, NULL, NULL, 0);
-		VERIFY3U(bpobj_iterate(defer_bpo,
-		    spa_free_sync_cb, zio, tx), ==, 0);
-		VERIFY0(zio_wait(zio));
+	uint64_t queue_depth_total = 0;
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		metaslab_group_t *mg = tvd->vdev_mg;
+
+		if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
+		    !metaslab_group_initialized(mg))
+			continue;
+
+		/*
+		 * It is safe to do a lock-free check here because only async
+		 * allocations look at mg_max_alloc_queue_depth, and async
+		 * allocations all happen from spa_sync().
+		 */
+		ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
+		mg->mg_max_alloc_queue_depth = max_queue_depth;
+		queue_depth_total += mg->mg_max_alloc_queue_depth;
 	}
+	metaslab_class_t *mc = spa_normal_class(spa);
+	ASSERT0(refcount_count(&mc->mc_alloc_slots));
+	mc->mc_alloc_max_slots = queue_depth_total;
+	mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
 
+	ASSERT3U(mc->mc_alloc_max_slots, <=,
+	    max_queue_depth * rvd->vdev_children);
+
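
For a sense of scale, assuming the stock tunable defaults (an assumption,
not part of this diff: zfs_vdev_async_write_max_active = 10 and
zfs_vdev_queue_depth_pct = 1000, i.e. 1000%):

    /* Hypothetical defaults, restated for illustration only. */
    uint32_t max_queue_depth = 10 * 1000 / 100; /* 100 per top-level vdev */
    /* A pool with 4 top-level vdevs gets mc_alloc_max_slots = 400. */
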
 	/*
 	 * Iterate to convergence.
 	 */
@@ -6359,13 +6786,15 @@
 		dsl_pool_sync(dp, txg);
 
 		if (pass < zfs_sync_pass_deferred_free) {
-			zio_t *zio = zio_root(spa, NULL, NULL, 0);
-			bplist_iterate(free_bpl, spa_free_sync_cb,
-			    zio, tx);
-			VERIFY(zio_wait(zio) == 0);
+			spa_sync_frees(spa, free_bpl, tx);
 		} else {
+			/*
+			 * We cannot defer frees in pass 1, because
+			 * we sync the deferred frees later in pass 1.
+			 */
+			ASSERT3U(pass, >, 1);
 			bplist_iterate(free_bpl, bpobj_enqueue_cb,
-			    defer_bpo, tx);
+			    &spa->spa_deferred_bpobj, tx);
 		}
 
 		ddt_sync(spa, txg);
@@ -6374,8 +6803,37 @@
 		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
 			vdev_sync(vd, txg);
 
-		if (pass == 1)
+		if (pass == 1) {
 			spa_sync_upgrades(spa, tx);
+			ASSERT3U(txg, >=,
+			    spa->spa_uberblock.ub_rootbp.blk_birth);
+			/*
+			 * Note: We need to check if the MOS is dirty
+			 * because we could have marked the MOS dirty
+			 * without updating the uberblock (e.g. if we
+			 * have sync tasks but no dirty user data).  We
+			 * need to check the uberblock's rootbp because
+			 * it is updated if we have synced out dirty
+			 * data (though in this case the MOS will most
+			 * likely also be dirty due to second order
+			 * effects, we don't want to rely on that here).
+			 */
+			if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+			    !dmu_objset_is_dirty(mos, txg)) {
+				/*
+				 * Nothing changed on the first pass,
+				 * therefore this TXG is a no-op.  Avoid
+				 * syncing deferred frees, so that we
+				 * can keep this TXG as a no-op.
+				 */
+				ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
+				    txg));
+				ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+				ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+				break;
+			}
+			spa_sync_deferred_frees(spa, tx);
+		}
 
 	} while (dmu_objset_is_dirty(mos, txg));
 
@@ -6409,16 +6867,10 @@
 				if (svdcount == SPA_DVAS_PER_BP)
 					break;
 			}
-			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
-			if (error != 0)
-				error = vdev_config_sync(svd, svdcount, txg,
-				    B_TRUE);
+			error = vdev_config_sync(svd, svdcount, txg);
 		} else {
 			error = vdev_config_sync(rvd->vdev_child,
-			    rvd->vdev_children, txg, B_FALSE);
-			if (error != 0)
-				error = vdev_config_sync(rvd->vdev_child,
-				    rvd->vdev_children, txg, B_TRUE);
+			    rvd->vdev_children, txg);
 		}
 
 		if (error == 0)
@@ -6435,11 +6887,11 @@
 
 #ifdef illumos
 	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
-#else	/* FreeBSD */
+#else	/* !illumos */
 #ifdef _KERNEL
 	callout_drain(&spa->spa_deadman_cycid);
 #endif
-#endif
+#endif	/* illumos */
 
 	/*
 	 * Clear the dirty config list.
@@ -6457,10 +6909,12 @@
 		spa->spa_config_syncing = NULL;
 	}
 
-	spa->spa_ubsync = spa->spa_uberblock;
-
 	dsl_pool_sync_done(dp, txg);
 
+	mutex_enter(&spa->spa_alloc_lock);
+	VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
+	mutex_exit(&spa->spa_alloc_lock);
+
 	/*
 	 * Update usable space statistics.
 	 */
@@ -6479,6 +6933,13 @@
 
 	spa->spa_sync_pass = 0;
 
+	/*
+	 * Update the last synced uberblock here. We want to do this at
+	 * the end of spa_sync() so that consumers of spa_last_synced_txg()
+	 * will be guaranteed that all the processing associated with
+	 * that txg has been completed.
+	 */
+	spa->spa_ubsync = spa->spa_uberblock;
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	spa_handle_ignored_writes(spa);
@@ -6487,6 +6948,7 @@
 	 * If any async tasks have been requested, kick them off.
 	 */
 	spa_async_dispatch(spa);
+	spa_async_dispatch_vd(spa);
 }
 
 /*
@@ -6591,7 +7053,7 @@
 	 * possible.
 	 */
 	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
-	ASSERT(version >= spa->spa_uberblock.ub_version);
+	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
 
 	spa->spa_uberblock.ub_version = version;
 	vdev_config_dirty(spa->spa_root_vdev);
@@ -6642,24 +7104,17 @@
 	return (B_FALSE);
 }
 
-/*
- * Post a sysevent corresponding to the given event.  The 'name' must be one of
- * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
- * filled in from the spa and (optionally) the vdev.  This doesn't do anything
- * in the userland libzpool, as we don't want consumers to misinterpret ztest
- * or zdb as real changes.
- */
-void
-spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
+static sysevent_t *
+spa_event_create(spa_t *spa, vdev_t *vd, const char *name)
 {
+	sysevent_t		*ev = NULL;
 #ifdef _KERNEL
-	sysevent_t		*ev;
 	sysevent_attr_list_t	*attr = NULL;
 	sysevent_value_t	value;
-	sysevent_id_t		eid;
 
 	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
 	    SE_SLEEP);
+	ASSERT(ev != NULL);
 
 	value.value_type = SE_DATA_TYPE_STRING;
 	value.value.sv_string = spa_name(spa);
@@ -6691,11 +7146,34 @@
 		goto done;
 	attr = NULL;
 
-	(void) log_sysevent(ev, SE_SLEEP, &eid);
-
 done:
 	if (attr)
 		sysevent_free_attr(attr);
+
+#endif
+	return (ev);
+}
+
+static void
+spa_event_post(sysevent_t *ev)
+{
+#ifdef _KERNEL
+	sysevent_id_t		eid;
+
+	(void) log_sysevent(ev, SE_SLEEP, &eid);
 	sysevent_free(ev);
 #endif
 }
+
+/*
+ * Post a sysevent corresponding to the given event.  The 'name' must be one of
+ * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
+ * filled in from the spa and (optionally) the vdev.  This doesn't do anything
+ * in the userland libzpool, as we don't want consumers to misinterpret ztest
+ * or zdb as real changes.
+ */
+void
+spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
+{
+	spa_event_post(spa_event_create(spa, vd, name));
+}

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -23,7 +23,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -141,6 +141,26 @@
 	kobj_close_file(file);
 }
 
+static void
+spa_config_clean(nvlist_t *nvl)
+{
+	nvlist_t **child;
+	nvlist_t *nvroot = NULL;
+	uint_t c, children;
+
+	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
+	    &children) == 0) {
+		for (c = 0; c < children; c++)
+			spa_config_clean(child[c]);
+	}
+
+	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0)
+		spa_config_clean(nvroot);
+
+	nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, DATA_TYPE_UINT64_ARRAY);
+	nvlist_remove(nvl, ZPOOL_CONFIG_SCAN_STATS, DATA_TYPE_UINT64_ARRAY);
+}
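
ZPOOL_CONFIG_VDEV_STATS and ZPOOL_CONFIG_SCAN_STATS are volatile runtime
counters, so having spa_config_clean() walk the vdev tree and strip them
keeps the cachefile written below limited to what is actually needed to
reopen the pool; the counters would be stale the moment they hit disk.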
+
 static int
 spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
 {
@@ -197,7 +217,12 @@
 
 /*
  * Synchronize pool configuration to disk.  This must be called with the
- * namespace lock held.
+ * namespace lock held. Synchronizing the pool cache is typically done after
+ * the configuration has been synced to the MOS. This exposes a window where
+ * the MOS config will have been updated but the cache file has not. If
+ * the system were to crash at that instant then the cached config may not
+ * contain the correct information to open the pool and an explicit import
+ * would be required.
  */
 void
 spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
@@ -206,6 +231,7 @@
 	nvlist_t *nvl;
 	boolean_t ccw_failure;
 	int error;
+	char *pool_name;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
@@ -229,6 +255,7 @@
 		 */
 		nvl = NULL;
 		while ((spa = spa_next(spa)) != NULL) {
+			nvlist_t *nvroot = NULL;
 			/*
 			 * Skip over our own pool if we're about to remove
 			 * ourselves from the spa namespace or any pool that
@@ -237,7 +264,8 @@
 			 * we don't allow them to be written to the cache file.
 			 */
 			if ((spa == target && removing) ||
-			    !spa_writeable(spa))
+			    (spa_state(spa) == POOL_STATE_ACTIVE &&
+			    !spa_writeable(spa)))
 				continue;
 
 			mutex_enter(&spa->spa_props_lock);
@@ -253,9 +281,19 @@
 				VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME,
 				    KM_SLEEP) == 0);
 
-			VERIFY(nvlist_add_nvlist(nvl, spa->spa_name,
-			    spa->spa_config) == 0);
+			if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) {
+				pool_name = fnvlist_lookup_string(spa->spa_config,
+				    ZPOOL_CONFIG_POOL_NAME);
+			} else {
+				pool_name = spa_name(spa);
+			}
+
+			fnvlist_add_nvlist(nvl, pool_name,
+			    spa->spa_config);
 			mutex_exit(&spa->spa_props_lock);
+
+			if (nvlist_lookup_nvlist(nvl, pool_name, &nvroot) == 0)
+				spa_config_clean(nvroot);
 		}
 
 		error = spa_config_write(dp, nvl);
@@ -338,8 +376,7 @@
 spa_config_set(spa_t *spa, nvlist_t *config)
 {
 	mutex_enter(&spa->spa_props_lock);
-	if (spa->spa_config != NULL)
-		nvlist_free(spa->spa_config);
+	nvlist_free(spa->spa_config);
 	spa->spa_config = config;
 	mutex_exit(&spa->spa_props_lock);
 }
@@ -358,6 +395,7 @@
 	unsigned long hostid = 0;
 	boolean_t locked = B_FALSE;
 	uint64_t split_guid;
+	char *pool_name;
 
 	if (vd == NULL) {
 		vd = rvd;
@@ -374,22 +412,36 @@
 	if (txg == -1ULL)
 		txg = spa->spa_config_txg;
 
-	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	/*
+	 * Originally, users had to handle spa namespace collisions by either
+	 * exporting the already imported pool or by specifying a new name for
+	 * the pool with a conflicting name. In the case of root pools from
+	 * virtual guests, neither approach to collision resolution is
+	 * reasonable. This is addressed by extending the new name syntax with
+	 * an option to specify that the new name is temporary. When specified,
+	 * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us
+	 * to use the previous name, which we do below.
+	 */
+	if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) {
+		pool_name = fnvlist_lookup_string(spa->spa_config,
+		    ZPOOL_CONFIG_POOL_NAME);
+	} else {
+		pool_name = spa_name(spa);
+	}
 
-	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
-	    spa_version(spa)) == 0);
-	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
-	    spa_name(spa)) == 0);
-	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
-	    spa_state(spa)) == 0);
-	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
-	    txg) == 0);
-	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
-	    spa_guid(spa)) == 0);
-	VERIFY(spa->spa_comment == NULL || nvlist_add_string(config,
-	    ZPOOL_CONFIG_COMMENT, spa->spa_comment) == 0);
+	config = fnvlist_alloc();
 
+	fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
+	fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name);
+	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa));
+	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg);
+	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
+	if (spa->spa_comment != NULL) {
+		fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
+		    spa->spa_comment);
+	}
 
+
 #ifdef	_KERNEL
 	hostid = zone_get_hostid(NULL);
 #else	/* _KERNEL */
@@ -515,8 +567,10 @@
 		 */
 		for (c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *tvd = rvd->vdev_child[c];
-			if (tvd->vdev_ms_array == 0)
+			if (tvd->vdev_ms_array == 0) {
+				vdev_ashift_optimize(tvd);
 				vdev_metaslab_set_size(tvd);
+			}
 			vdev_expand(tvd, txg);
 		}
 	}
@@ -530,8 +584,7 @@
 	/*
 	 * Update the global config cache to reflect the new mosconfig.
 	 */
-	if (!spa->spa_is_root)
-		spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);
+	spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);
 
 	if (what == SPA_CONFIG_UPDATE_POOL)
 		spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
  */
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
  */
 
 /*
@@ -36,7 +36,7 @@
  * deleted from the log when the scrub completes.
  *
  * The log is stored using a ZAP object whose key is a string form of the
- * zbookmark tuple (objset, object, level, blkid), and whose contents is an
+ * zbookmark_phys tuple (objset, object, level, blkid), and whose value is an
  * optional 'objset:object' human-readable string describing the data.  When an
  * error is first logged, this string will be empty, indicating that no name is
  * known.  This prevents us from having to issue a potentially large amount of
@@ -60,7 +60,7 @@
  * Convert a bookmark to a string.
  */
 static void
-bookmark_to_name(zbookmark_t *zb, char *buf, size_t len)
+bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
 {
 	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
 	    (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
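
So a bookmark of, say, {objset 0x21, object 0x5, level 0, blkid 0x80}
serializes to the ZAP key "21:5:0:80", and name_to_bookmark() below
parses that form back into a zbookmark_phys_t.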
@@ -72,7 +72,7 @@
  */
 #ifdef _KERNEL
 static void
-name_to_bookmark(char *buf, zbookmark_t *zb)
+name_to_bookmark(char *buf, zbookmark_phys_t *zb)
 {
 	zb->zb_objset = strtonum(buf, &buf);
 	ASSERT(*buf == ':');
@@ -93,7 +93,7 @@
 void
 spa_log_error(spa_t *spa, zio_t *zio)
 {
-	zbookmark_t *zb = &zio->io_logical->io_bookmark;
+	zbookmark_phys_t *zb = &zio->io_logical->io_bookmark;
 	spa_error_entry_t search;
 	spa_error_entry_t *new;
 	avl_tree_t *tree;
@@ -166,7 +166,7 @@
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 
 	if (obj == 0)
 		return (0);
@@ -183,8 +183,8 @@
 		name_to_bookmark(za.za_name, &zb);
 
 		if (copyout(&zb, (char *)addr +
-		    (*count - 1) * sizeof (zbookmark_t),
-		    sizeof (zbookmark_t)) != 0) {
+		    (*count - 1) * sizeof (zbookmark_phys_t),
+		    sizeof (zbookmark_phys_t)) != 0) {
 			zap_cursor_fini(&zc);
 			return (SET_ERROR(EFAULT));
 		}
@@ -208,8 +208,8 @@
 			return (SET_ERROR(ENOMEM));
 
 		if (copyout(&se->se_bookmark, (char *)addr +
-		    (*count - 1) * sizeof (zbookmark_t),
-		    sizeof (zbookmark_t)) != 0)
+		    (*count - 1) * sizeof (zbookmark_phys_t),
+		    sizeof (zbookmark_phys_t)) != 0)
 			return (SET_ERROR(EFAULT));
 
 		*count -= 1;

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,7 +22,8 @@
 
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/spa.h>
@@ -91,7 +92,7 @@
 
 	ASSERT(spa->spa_history == 0);
 	spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
-	    SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
 	    sizeof (spa_history_phys_t), tx);
 
 	VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
@@ -325,7 +326,7 @@
 
 	/* Kick this off asynchronously; errors are ignored. */
 	dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync,
-	    nvarg, 0, tx);
+	    nvarg, 0, ZFS_SPACE_CHECK_NONE, tx);
 	dmu_tx_commit(tx);
 
 	/* spa_history_log_sync will free nvl */
@@ -466,7 +467,7 @@
 		spa_history_log_sync(nvl, tx);
 	} else {
 		dsl_sync_task_nowait(spa_get_dsl(spa),
-		    spa_history_log_sync, nvl, 0, tx);
+		    spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx);
 	}
 	/* spa_history_log_sync() will free nvl */
 }
@@ -501,7 +502,7 @@
     dmu_tx_t *tx, const char *fmt, ...)
 {
 	va_list adx;
-	char namebuf[MAXNAMELEN];
+	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
 	nvlist_t *nvl = fnvlist_alloc();
 
 	ASSERT(tx != NULL);
@@ -520,7 +521,7 @@
     dmu_tx_t *tx, const char *fmt, ...)
 {
 	va_list adx;
-	char namebuf[MAXNAMELEN];
+	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
 	nvlist_t *nvl = fnvlist_alloc();
 
 	ASSERT(tx != NULL);
@@ -528,7 +529,7 @@
 	dsl_dir_name(dd, namebuf);
 	fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
 	fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID,
-	    dd->dd_phys->dd_head_dataset_obj);
+	    dsl_dir_phys(dd)->dd_head_dataset_obj);
 
 	va_start(adx, fmt);
 	log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,9 +21,12 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
@@ -37,6 +40,7 @@
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_file.h>
 #include <sys/metaslab.h>
 #include <sys/uberblock_impl.h>
 #include <sys/txg.h>
@@ -51,7 +55,7 @@
 #include <sys/arc.h>
 #include <sys/ddt.h>
 #include "zfs_prop.h"
-#include "zfeature_common.h"
+#include <sys/zfeature.h>
 
 /*
  * SPA locking
@@ -244,37 +248,111 @@
 #else
 int zfs_flags = 0;
 #endif
+SYSCTL_DECL(_debug);
+TUNABLE_INT("debug.zfs_flags", &zfs_flags);
+SYSCTL_INT(_debug, OID_AUTO, zfs_flags, CTLFLAG_RWTUN, &zfs_flags, 0,
+    "ZFS debug flags.");
 
 /*
  * zfs_recover can be set to nonzero to attempt to recover from
  * otherwise-fatal errors, typically caused by on-disk corruption.  When
  * set, calls to zfs_panic_recover() will turn into warning messages.
+ * This should only be used as a last resort, as it typically results
+ * in leaked space, or worse.
  */
-int zfs_recover = 0;
+boolean_t zfs_recover = B_FALSE;
 SYSCTL_DECL(_vfs_zfs);
 TUNABLE_INT("vfs.zfs.recover", &zfs_recover);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0,
+SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0,
     "Try to recover from otherwise-fatal errors.");
 
-extern int zfs_txg_synctime_ms;
+static int
+sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS)
+{
+	int err, val;
 
+	val = zfs_flags;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	/*
+	 * ZFS_DEBUG_MODIFY must be enabled prior to boot so all
+	 * arc buffers in the system have the necessary additional
+	 * checksum data.  However, it is safe to disable at any
+	 * time.
+	 */
+	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+		val &= ~ZFS_DEBUG_MODIFY;
+	zfs_flags = val;
+
+	return (0);
+}
+TUNABLE_INT("vfs.zfs.debugflags", &zfs_flags);
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
+    sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, debug_flags,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
+    sysctl_vfs_zfs_debug_flags, "IU",
+    "Debug flags for ZFS testing (deprecated, see vfs.zfs.debugflags).");
+
 /*
- * Expiration time in units of zfs_txg_synctime_ms. This value has two
- * meanings. First it is used to determine when the spa_deadman logic
- * should fire. By default the spa_deadman will fire if spa_sync has
- * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
- * Secondly, the value determines if an I/O is considered "hung".
- * Any I/O that has not completed in zfs_deadman_synctime is considered
- * "hung" resulting in a system panic.
- * 1000 zfs_txg_synctime_ms (i.e. 1000 seconds).
+ * If destroy encounters an EIO while reading metadata (e.g. indirect
+ * blocks), space referenced by the missing metadata can not be freed.
+ * Normally this causes the background destroy to become "stalled", as
+ * it is unable to make forward progress.  While in this stalled state,
+ * all remaining space to free from the error-encountering filesystem is
+ * "temporarily leaked".  Set this flag to cause it to ignore the EIO,
+ * permanently leak the space from indirect blocks that can not be read,
+ * and continue to free everything else that it can.
+ *
+ * The default, "stalling" behavior is useful if the storage partially
+ * fails (i.e. some but not all i/os fail), and then later recovers.  In
+ * this case, we will be able to continue pool operations while it is
+ * partially failed, and when it recovers, we can continue to free the
+ * space, with no leaks.  However, note that this case is actually
+ * fairly rare.
+ *
+ * Typically pools either (a) fail completely (but perhaps temporarily,
+ * e.g. a top-level vdev going offline), or (b) have localized,
+ * permanent errors (e.g. disk returns the wrong data due to bit flip or
+ * firmware bug).  In case (a), this setting does not matter because the
+ * pool will be suspended and the sync thread will not be able to make
+ * forward progress regardless.  In case (b), because the error is
+ * permanent, the best we can do is leak the minimum amount of space,
+ * which is what setting this flag will do.  Therefore, it is reasonable
+ * for this flag to normally be set, but we chose the more conservative
+ * approach of not setting it, so that there is no possibility of
+ * leaking space in the "partial temporary" failure case.
  */
-uint64_t zfs_deadman_synctime = 1000ULL;
-TUNABLE_QUAD("vfs.zfs.deadman_synctime", &zfs_deadman_synctime);
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime, CTLFLAG_RDTUN,
-    &zfs_deadman_synctime, 0,
-    "Stalled ZFS I/O expiration time in units of vfs.zfs.txg_synctime_ms");
+boolean_t zfs_free_leak_on_eio = B_FALSE;
 
 /*
+ * Expiration time in milliseconds. This value has two meanings. First it is
+ * used to determine when the spa_deadman() logic should fire. By default the
+ * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
+ * Secondly, the value determines if an I/O is considered "hung". Any I/O that
+ * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
+ * in a system panic.
+ */
+uint64_t zfs_deadman_synctime_ms = 1000000ULL;
+TUNABLE_QUAD("vfs.zfs.deadman_synctime_ms", &zfs_deadman_synctime_ms);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN,
+    &zfs_deadman_synctime_ms, 0,
+    "Stalled ZFS I/O expiration time in milliseconds");
+
+/*
+ * Check time in milliseconds. This defines the frequency at which we check
+ * for hung I/O.
+ */
+uint64_t zfs_deadman_checktime_ms = 5000ULL;
+TUNABLE_QUAD("vfs.zfs.deadman_checktime_ms", &zfs_deadman_checktime_ms);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN,
+    &zfs_deadman_checktime_ms, 0,
+    "Period of checks for stalled ZFS I/O in milliseconds");
+
+/*
  * Default value of -1 for zfs_deadman_enabled is resolved in
  * zfs_deadman_init()
  */
@@ -283,6 +361,20 @@
 SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN,
     &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");
 
+/*
+ * The worst case is single-sector max-parity RAID-Z blocks, in which
+ * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
+ * times the size; so just assume that.  Add to this the fact that
+ * we can have up to 3 DVAs per bp, and one more factor of 2 because
+ * the block may be dittoed with up to 3 DVAs by ddt_sync().  All together,
+ * the worst case is:
+ *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
+ */
+int spa_asize_inflation = 24;
+TUNABLE_INT("vfs.zfs.spa_asize_inflation", &spa_asize_inflation);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN,
+    &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes");
+
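The default of 24 falls straight out of the constants named in the
comment; as a worked check (the two values below match the stock headers,
but are restated here only for illustration):

    #define VDEV_RAIDZ_MAXPARITY    3   /* raidz1..raidz3 */
    #define SPA_DVAS_PER_BP         3   /* up to 3 DVAs per block pointer */

    /* (3 + 1) * 3 * 2 == 24, the default of spa_asize_inflation. */
    int worst_case = (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2;
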
 #ifndef illumos
 #ifdef _KERNEL
 static void
@@ -304,6 +396,43 @@
 #endif	/* !illumos */
 
 /*
+ * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
+ * the pool to be consumed.  This ensures that we don't run the pool
+ * completely out of space, due to unaccounted changes (e.g. to the MOS).
+ * It also limits the worst-case time to allocate space.  If we have
+ * less than this amount of free space, most ZPL operations (e.g. write,
+ * create) will return ENOSPC.
+ *
+ * Certain operations (e.g. file removal, most administrative actions) can
+ * use half the slop space.  They will only return ENOSPC if less than half
+ * the slop space is free.  Typically, once the pool has less than the slop
+ * space free, the user will use these operations to free up space in the pool.
+ * These are the operations that call dsl_pool_adjustedsize() with the netfree
+ * argument set to TRUE.
+ *
+ * A very restricted set of operations are always permitted, regardless of
+ * the amount of free space.  These are the operations that call
+ * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy".  If these
+ * operations result in a net increase in the amount of space used,
+ * it is possible to run the pool completely out of space, causing it to
+ * be permanently read-only.
+ *
+ * Note that on very small pools, the slop space will be larger than
+ * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
+ * but we never allow it to be more than half the pool size.
+ *
+ * See also the comments in zfs_space_check_t.
+ */
+int spa_slop_shift = 5;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN,
+    &spa_slop_shift, 0,
+    "Shift value of reserved space (1/(2^spa_slop_shift)).");
+uint64_t spa_min_slop = 128 * 1024 * 1024;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN,
+    &spa_min_slop, 0,
+    "Minimal value of reserved space");
+
+/*
  * ==========================================================================
  * SPA config locking
  * ==========================================================================
@@ -345,7 +474,8 @@
 		if (rw == RW_READER) {
 			if (scl->scl_writer || scl->scl_write_wanted) {
 				mutex_exit(&scl->scl_lock);
-				spa_config_exit(spa, locks ^ (1 << i), tag);
+				spa_config_exit(spa, locks & ((1 << i) - 1),
+				    tag);
 				return (0);
 			}
 		} else {
@@ -352,7 +482,8 @@
 			ASSERT(scl->scl_writer != curthread);
 			if (!refcount_is_zero(&scl->scl_count)) {
 				mutex_exit(&scl->scl_lock);
-				spa_config_exit(spa, locks ^ (1 << i), tag);
+				spa_config_exit(spa, locks & ((1 << i) - 1),
+				    tag);
 				return (0);
 			}
 			scl->scl_writer = curthread;
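
The change in these two hunks is a real fix: when lock i cannot be taken,
only the requested locks with indices below i are actually held, so only
those may be dropped.  The old mask (locks ^ (1 << i)) also covered
requested locks above i that were never acquired.  Reduced to a sketch
(NLOCKS, try_enter_one() and exit_locks() are hypothetical stand-ins):

    static int
    try_enter_all(int want)
    {
        for (int i = 0; i < NLOCKS; i++) {
            if (!(want & (1 << i)))
                continue;
            if (!try_enter_one(i)) {
                /* Drop only what is held so far: requested bits below i. */
                exit_locks(want & ((1 << i) - 1));
                return (0);
            }
        }
        return (1);
    }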
@@ -458,7 +589,7 @@
 	 * If it's a full dataset name, figure out the pool name and
 	 * just use that.
 	 */
-	cp = strpbrk(search.spa_name, "/@");
+	cp = strpbrk(search.spa_name, "/@#");
 	if (cp != NULL)
 		*cp = '\0';
 
@@ -472,18 +603,46 @@
  * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
  * looking for potentially hung I/Os.
  */
-void
-spa_deadman(void *arg)
+static void
+spa_deadman(void *arg, int pending)
 {
 	spa_t *spa = arg;
 
+	/*
+	 * Disable the deadman timer if the pool is suspended.
+	 */
+	if (spa_suspended(spa)) {
+#ifdef illumos
+		VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
+#else
+		/* Nothing; just don't schedule any future callouts. */
+#endif
+		return;
+	}
+
 	zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
 	    (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
 	    ++spa->spa_deadman_calls);
 	if (zfs_deadman_enabled)
 		vdev_deadman(spa->spa_root_vdev);
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+	callout_schedule(&spa->spa_deadman_cycid,
+	    hz * zfs_deadman_checktime_ms / MILLISEC);
+#endif
+#endif
 }
 
+#if defined(__FreeBSD__) && defined(_KERNEL)
+static void
+spa_deadman_timeout(void *arg)
+{
+	spa_t *spa = arg;
+
+	taskqueue_enqueue(taskqueue_thread, &spa->spa_deadman_task);
+}
+#endif
+
 /*
  * Create an uninitialized spa_t with the given name.  Requires
  * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
@@ -506,14 +665,18 @@
 	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
@@ -535,8 +698,7 @@
 	hdlr.cyh_level = CY_LOW_LEVEL;
 #endif
 
-	spa->spa_deadman_synctime = zfs_deadman_synctime *
-	    zfs_txg_synctime_ms * MICROSEC;
+	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
 
 #ifdef illumos
 	/*
@@ -543,9 +705,9 @@
 	 * This determines how often we need to check for hung I/Os after
 	 * the cyclic has already fired. Since checking for hung I/Os is
 	 * an expensive operation we don't want to check too frequently.
-	 * Instead wait for 5 synctimes before checking again.
+	 * Instead wait for 5 seconds before checking again.
 	 */
-	when.cyt_interval = 5ULL * zfs_txg_synctime_ms * MICROSEC;
+	when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
 	when.cyt_when = CY_INFINITY;
 	mutex_enter(&cpu_lock);
 	spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
@@ -552,7 +714,23 @@
 	mutex_exit(&cpu_lock);
 #else	/* !illumos */
 #ifdef _KERNEL
-	callout_init(&spa->spa_deadman_cycid, CALLOUT_MPSAFE);
+	/*
+	 * callout(9) does not provide a way to initialize a callout with
+	 * a function and an argument, so we use callout_reset() to schedule
+	 * the callout in the very distant future.  Even if that event ever
+	 * fires, it should be okay as we won't have any active zio-s.
+	 * But normally spa_sync() will reschedule the callout with a proper
+	 * timeout.
+	 * callout(9) does not allow the callback function to sleep but
+	 * vdev_deadman() needs to acquire vq_lock and illumos mutexes are
+	 * emulated using sx(9).  For this reason spa_deadman_timeout()
+	 * will schedule spa_deadman() as a task on a taskqueue that allows
+	 * sleeping.
+	 */
+	TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa);
+	callout_init(&spa->spa_deadman_cycid, 1);
+	callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0,
+	    spa_deadman_timeout, spa, 0);
 #endif
 #endif
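
The comment above captures the deferral pattern: a callout(9) handler may not
sleep, so it only enqueues a task, and the real work runs on taskqueue_thread
where sleepable locks are allowed.  A generic kernel-side sketch of the same
pattern (function and variable names are hypothetical, not the ZFS code):

    #include <sys/param.h>
    #include <sys/kernel.h>
    #include <sys/callout.h>
    #include <sys/taskqueue.h>

    static struct callout tick;
    static struct task work;

    static void
    do_work(void *arg, int pending)
    {
    	/* Taskqueue context: sleeping (e.g. on sx(9) locks) is allowed. */
    }

    static void
    tick_cb(void *arg)
    {
    	/* Callout context: must not sleep; hand off to the taskqueue. */
    	taskqueue_enqueue(taskqueue_thread, &work);
    	callout_schedule(&tick, hz);	/* re-arm one second out */
    }

    static void
    setup(void)
    {
    	TASK_INIT(&work, 0, do_work, NULL);
    	callout_init(&tick, 1);		/* 1 == MPSAFE */
    	callout_reset(&tick, hz, tick_cb, NULL);
    }
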
 	refcount_create(&spa->spa_refcount);
@@ -568,6 +746,9 @@
 		spa_active_count++;
 	}
 
+	avl_create(&spa->spa_alloc_tree, zio_timestamp_compare,
+	    sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+
 	/*
 	 * Every pool starts with the default cachefile
 	 */
@@ -600,6 +781,18 @@
 
 	spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);
 
+	spa->spa_min_ashift = INT_MAX;
+	spa->spa_max_ashift = 0;
+
+	/*
+	 * As a pool is being created, treat all features as disabled by
+	 * setting SPA_FEATURE_DISABLED for all entries in the feature
+	 * refcount cache.
+	 */
+	for (int i = 0; i < SPA_FEATURES; i++) {
+		spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
+	}
+
 	return (spa);
 }
 
@@ -615,6 +808,7 @@
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+	ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0);
 
 	nvlist_free(spa->spa_config_splitting);
 
@@ -633,6 +827,7 @@
 		kmem_free(dp, sizeof (spa_config_dirent_t));
 	}
 
+	avl_destroy(&spa->spa_alloc_tree);
 	list_destroy(&spa->spa_config_list);
 
 	nvlist_free(spa->spa_label_features);
@@ -648,6 +843,7 @@
 #else	/* !illumos */
 #ifdef _KERNEL
 	callout_drain(&spa->spa_deadman_cycid);
+	taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task);
 #endif
 #endif
 
@@ -658,17 +854,23 @@
 	for (int t = 0; t < TXG_SIZE; t++)
 		bplist_destroy(&spa->spa_free_bplist[t]);
 
+	zio_checksum_templates_free(spa);
+
 	cv_destroy(&spa->spa_async_cv);
+	cv_destroy(&spa->spa_evicting_os_cv);
 	cv_destroy(&spa->spa_proc_cv);
 	cv_destroy(&spa->spa_scrub_io_cv);
 	cv_destroy(&spa->spa_suspend_cv);
 
+	mutex_destroy(&spa->spa_alloc_lock);
 	mutex_destroy(&spa->spa_async_lock);
 	mutex_destroy(&spa->spa_errlist_lock);
 	mutex_destroy(&spa->spa_errlog_lock);
+	mutex_destroy(&spa->spa_evicting_os_lock);
 	mutex_destroy(&spa->spa_history_lock);
 	mutex_destroy(&spa->spa_proc_lock);
 	mutex_destroy(&spa->spa_props_lock);
+	mutex_destroy(&spa->spa_cksum_tmpls_lock);
 	mutex_destroy(&spa->spa_scrub_lock);
 	mutex_destroy(&spa->spa_suspend_lock);
 	mutex_destroy(&spa->spa_vdev_top_lock);
@@ -722,6 +924,20 @@
 }
 
 /*
+ * Remove a reference to the given spa_t held by a dsl dir that is
+ * being asynchronously released.  Async releases occur from a taskq
+ * performing eviction of dsl datasets and dirs.  The namespace lock
+ * isn't held and the hold by the object being evicted may contribute to
+ * spa_minref (e.g. dataset or directory released during pool export),
+ * so the asserts in spa_close() do not apply.
+ */
+void
+spa_async_close(spa_t *spa, void *tag)
+{
+	(void) refcount_remove(&spa->spa_refcount, tag);
+}
+
+/*
  * Check to see if the spa refcount is zero.  Must be called with
  * spa_namespace_lock held.  We really compare against spa_minref, which is the
  * number of references acquired when opening a pool
@@ -1049,7 +1265,7 @@
 		txg_wait_synced(spa->spa_dsl_pool, txg);
 
 	if (vd != NULL) {
-		ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0);
+		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
 		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 		vdev_free(vd);
 		spa_config_exit(spa, SCL_ALL, spa);
@@ -1157,17 +1373,27 @@
  */
 
 void
-spa_activate_mos_feature(spa_t *spa, const char *feature)
+spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
 {
-	(void) nvlist_add_boolean(spa->spa_label_features, feature);
-	vdev_config_dirty(spa->spa_root_vdev);
+	if (!nvlist_exists(spa->spa_label_features, feature)) {
+		fnvlist_add_boolean(spa->spa_label_features, feature);
+		/*
+		 * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
+		 * dirty the vdev config because the SCL_CONFIG lock is not held.
+		 * Thankfully, in this case we don't need to dirty the config
+		 * because it will be written out anyway when we finish
+		 * creating the pool.
+		 */
+		if (tx->tx_txg != TXG_INITIAL)
+			vdev_config_dirty(spa->spa_root_vdev);
+	}
 }
 
 void
 spa_deactivate_mos_feature(spa_t *spa, const char *feature)
 {
-	(void) nvlist_remove_all(spa->spa_label_features, feature);
-	vdev_config_dirty(spa->spa_root_vdev);
+	if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
+		vdev_config_dirty(spa->spa_root_vdev);
 }
 
 /*
@@ -1318,7 +1544,7 @@
 }
 
 void
-sprintf_blkptr(char *buf, const blkptr_t *bp)
+snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
 {
 	char type[256];
 	char *checksum = NULL;
@@ -1336,11 +1562,15 @@
 			(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
 			    sizeof (type));
 		}
-		checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+		if (!BP_IS_EMBEDDED(bp)) {
+			checksum =
+			    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+		}
 		compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
 	}
 
-	SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress);
+	SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
+	    compress);
 }
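
With the length now explicit, callers size the buffer with BP_SPRINTF_LEN and
pass that length through, as the updated dprintf_dbuf_bp() macro in dbuf.h later
in this patch does.  A hedged usage sketch (assumes a blkptr_t *bp in scope):

    	char *buf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);

    	snprintf_blkptr(buf, BP_SPRINTF_LEN, bp);
    	zfs_dbgmsg("processing bp %s", buf);
    	kmem_free(buf, BP_SPRINTF_LEN);
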
 
 void
@@ -1535,17 +1765,24 @@
 uint64_t
 spa_get_asize(spa_t *spa, uint64_t lsize)
 {
-	/*
-	 * The worst case is single-sector max-parity RAID-Z blocks, in which
-	 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
-	 * times the size; so just assume that.  Add to this the fact that
-	 * we can have up to 3 DVAs per bp, and one more factor of 2 because
-	 * the block may be dittoed with up to 3 DVAs by ddt_sync().
-	 */
-	return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
+	return (lsize * spa_asize_inflation);
 }
 
+/*
+ * Return the amount of slop space in bytes.  It is 1/32 of the pool (3.1%),
+ * or at least 128MB, unless that would cause it to be more than half the
+ * pool size.
+ *
+ * See the comment above spa_slop_shift for details.
+ */
 uint64_t
+spa_get_slop_space(spa_t *spa)
+{
+	uint64_t space = spa_get_dspace(spa);
+	return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
+}
+
+uint64_t
 spa_get_dspace(spa_t *spa)
 {
 	return (spa->spa_dspace);
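
Worked through with the stock defaults (spa_slop_shift = 5, i.e. 1/32, and
spa_min_slop at its documented 128MB), the MAX/MIN expression above behaves as
the comment describes:

    100 GB pool: MAX(3.125 GB, MIN(50 GB,  128 MB)) = 3.125 GB  (1/32 rule)
      1 GB pool: MAX(32 MB,    MIN(512 MB, 128 MB)) = 128 MB    (spa_min_slop floor)
    200 MB pool: MAX(6.25 MB,  MIN(100 MB, 128 MB)) = 100 MB    (half-pool cap)
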
@@ -1598,6 +1835,34 @@
 	return (spa->spa_log_class);
 }
 
+void
+spa_evicting_os_register(spa_t *spa, objset_t *os)
+{
+	mutex_enter(&spa->spa_evicting_os_lock);
+	list_insert_head(&spa->spa_evicting_os_list, os);
+	mutex_exit(&spa->spa_evicting_os_lock);
+}
+
+void
+spa_evicting_os_deregister(spa_t *spa, objset_t *os)
+{
+	mutex_enter(&spa->spa_evicting_os_lock);
+	list_remove(&spa->spa_evicting_os_list, os);
+	cv_broadcast(&spa->spa_evicting_os_cv);
+	mutex_exit(&spa->spa_evicting_os_lock);
+}
+
+void
+spa_evicting_os_wait(spa_t *spa)
+{
+	mutex_enter(&spa->spa_evicting_os_lock);
+	while (!list_is_empty(&spa->spa_evicting_os_list))
+		cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
+	mutex_exit(&spa->spa_evicting_os_lock);
+
+	dmu_buf_user_evict_wait();
+}
+
 int
 spa_max_replication(spa_t *spa)
 {
@@ -1632,7 +1897,13 @@
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	if (asize != 0 && spa->spa_deflate) {
-		vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+		uint64_t vdev = DVA_GET_VDEV(dva);
+		vdev_t *vd = vdev_lookup_top(spa, vdev);
+		if (vd == NULL) {
+			panic(
+			    "dva_get_dsize_sync(): bad DVA %llu:%llu",
+			    (u_longlong_t)vdev, (u_longlong_t)asize);
+		}
 		dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
 	}
 
@@ -1644,7 +1915,7 @@
 {
 	uint64_t dsize = 0;
 
-	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
 		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
 
 	return (dsize);
@@ -1657,7 +1928,7 @@
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
-	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
 		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
 
 	spa_config_exit(spa, SCL_VDEV, FTAG);
@@ -1698,6 +1969,10 @@
 	spa_config_load();
 }
 
+#ifdef _KERNEL
+EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
+#endif
+
 void
 spa_init(int mode)
 {
@@ -1734,11 +2009,13 @@
 #endif /* illumos */
 	refcount_sysinit();
 	unique_init();
-	space_map_init();
+	range_tree_init();
 	zio_init();
+	lz4_init();
 	dmu_init();
 	zil_init();
 	vdev_cache_stat_init();
+	vdev_file_init();
 	zfs_prop_init();
 	zpool_prop_init();
 	zpool_feature_init();
@@ -1758,11 +2035,13 @@
 
 	spa_evict_all();
 
+	vdev_file_fini();
 	vdev_cache_stat_fini();
 	zil_fini();
 	dmu_fini();
+	lz4_fini();
 	zio_fini();
-	space_map_fini();
+	range_tree_fini();
 	unique_fini();
 	refcount_fini();
 
@@ -1811,6 +2090,16 @@
 	return (!!(spa->spa_mode & FWRITE));
 }
 
+/*
+ * Returns true if there is a pending sync task in any of the current
+ * syncing txg, the current quiescing txg, or the current open txg.
+ */
+boolean_t
+spa_has_pending_synctask(spa_t *spa)
+{
+	return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks));
+}
+
 int
 spa_mode(spa_t *spa)
 {
@@ -1888,3 +2177,12 @@
 {
 	return (spa->spa_debug);
 }
+
+int
+spa_maxblocksize(spa_t *spa)
+{
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
+		return (SPA_MAXBLOCKSIZE);
+	else
+		return (SPA_OLD_MAXBLOCKSIZE);
+}

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -24,323 +24,66 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+#include <sys/dsl_pool.h>
 #include <sys/zio.h>
 #include <sys/space_map.h>
+#include <sys/refcount.h>
+#include <sys/zfeature.h>
 
-static kmem_cache_t *space_seg_cache;
+SYSCTL_DECL(_vfs_zfs);
 
-void
-space_map_init(void)
-{
-	ASSERT(space_seg_cache == NULL);
-	space_seg_cache = kmem_cache_create("space_seg_cache",
-	    sizeof (space_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-}
-
-void
-space_map_fini(void)
-{
-	kmem_cache_destroy(space_seg_cache);
-	space_seg_cache = NULL;
-}
-
 /*
- * Space map routines.
- * NOTE: caller is responsible for all locking.
+ * The data for a given space map can be kept on blocks of any size.
+ * Larger blocks entail fewer i/o operations, but they also cause the
+ * DMU to keep more data in-core and to waste more i/o bandwidth
+ * when only a few blocks have changed since the last transaction group.
  */
-static int
-space_map_seg_compare(const void *x1, const void *x2)
-{
-	const space_seg_t *s1 = x1;
-	const space_seg_t *s2 = x2;
+int space_map_blksz = (1 << 12);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_blksz, CTLFLAG_RDTUN, &space_map_blksz, 0,
+    "Maximum block size for space map.  Must be power of 2 and greater than 4096.");
 
-	if (s1->ss_start < s2->ss_start) {
-		if (s1->ss_end > s2->ss_start)
-			return (0);
-		return (-1);
-	}
-	if (s1->ss_start > s2->ss_start) {
-		if (s1->ss_start < s2->ss_end)
-			return (0);
-		return (1);
-	}
-	return (0);
-}
-
-void
-space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
-	kmutex_t *lp)
-{
-	bzero(sm, sizeof (*sm));
-
-	cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL);
-
-	avl_create(&sm->sm_root, space_map_seg_compare,
-	    sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
-
-	sm->sm_start = start;
-	sm->sm_size = size;
-	sm->sm_shift = shift;
-	sm->sm_lock = lp;
-}
-
-void
-space_map_destroy(space_map_t *sm)
-{
-	ASSERT(!sm->sm_loaded && !sm->sm_loading);
-	VERIFY0(sm->sm_space);
-	avl_destroy(&sm->sm_root);
-	cv_destroy(&sm->sm_load_cv);
-}
-
-void
-space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	avl_index_t where;
-	space_seg_t *ss_before, *ss_after, *ss;
-	uint64_t end = start + size;
-	int merge_before, merge_after;
-
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	VERIFY(!sm->sm_condensing);
-	VERIFY(size != 0);
-	VERIFY3U(start, >=, sm->sm_start);
-	VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
-	VERIFY(sm->sm_space + size <= sm->sm_size);
-	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
-	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
-
-	ss = space_map_find(sm, start, size, &where);
-	if (ss != NULL) {
-		zfs_panic_recover("zfs: allocating allocated segment"
-		    "(offset=%llu size=%llu)\n",
-		    (longlong_t)start, (longlong_t)size);
-		return;
-	}
-
-	/* Make sure we don't overlap with either of our neighbors */
-	VERIFY(ss == NULL);
-
-	ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
-	ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);
-
-	merge_before = (ss_before != NULL && ss_before->ss_end == start);
-	merge_after = (ss_after != NULL && ss_after->ss_start == end);
-
-	if (merge_before && merge_after) {
-		avl_remove(&sm->sm_root, ss_before);
-		if (sm->sm_pp_root) {
-			avl_remove(sm->sm_pp_root, ss_before);
-			avl_remove(sm->sm_pp_root, ss_after);
-		}
-		ss_after->ss_start = ss_before->ss_start;
-		kmem_cache_free(space_seg_cache, ss_before);
-		ss = ss_after;
-	} else if (merge_before) {
-		ss_before->ss_end = end;
-		if (sm->sm_pp_root)
-			avl_remove(sm->sm_pp_root, ss_before);
-		ss = ss_before;
-	} else if (merge_after) {
-		ss_after->ss_start = start;
-		if (sm->sm_pp_root)
-			avl_remove(sm->sm_pp_root, ss_after);
-		ss = ss_after;
-	} else {
-		ss = kmem_cache_alloc(space_seg_cache, KM_SLEEP);
-		ss->ss_start = start;
-		ss->ss_end = end;
-		avl_insert(&sm->sm_root, ss, where);
-	}
-
-	if (sm->sm_pp_root)
-		avl_add(sm->sm_pp_root, ss);
-
-	sm->sm_space += size;
-}
-
-void
-space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
-{
-#ifdef illumos
-	avl_index_t where;
-#endif
-	space_seg_t *ss, *newseg;
-	uint64_t end = start + size;
-	int left_over, right_over;
-
-	VERIFY(!sm->sm_condensing);
-#ifdef illumos
-	ss = space_map_find(sm, start, size, &where);
-#else
-	ss = space_map_find(sm, start, size, NULL);
-#endif
-
-	/* Make sure we completely overlap with someone */
-	if (ss == NULL) {
-		zfs_panic_recover("zfs: freeing free segment "
-		    "(offset=%llu size=%llu)",
-		    (longlong_t)start, (longlong_t)size);
-		return;
-	}
-	VERIFY3U(ss->ss_start, <=, start);
-	VERIFY3U(ss->ss_end, >=, end);
-	VERIFY(sm->sm_space - size < sm->sm_size);
-
-	left_over = (ss->ss_start != start);
-	right_over = (ss->ss_end != end);
-
-	if (sm->sm_pp_root)
-		avl_remove(sm->sm_pp_root, ss);
-
-	if (left_over && right_over) {
-		newseg = kmem_cache_alloc(space_seg_cache, KM_SLEEP);
-		newseg->ss_start = end;
-		newseg->ss_end = ss->ss_end;
-		ss->ss_end = start;
-		avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
-		if (sm->sm_pp_root)
-			avl_add(sm->sm_pp_root, newseg);
-	} else if (left_over) {
-		ss->ss_end = start;
-	} else if (right_over) {
-		ss->ss_start = end;
-	} else {
-		avl_remove(&sm->sm_root, ss);
-		kmem_cache_free(space_seg_cache, ss);
-		ss = NULL;
-	}
-
-	if (sm->sm_pp_root && ss != NULL)
-		avl_add(sm->sm_pp_root, ss);
-
-	sm->sm_space -= size;
-}
-
-space_seg_t *
-space_map_find(space_map_t *sm, uint64_t start, uint64_t size,
-    avl_index_t *wherep)
-{
-	space_seg_t ssearch, *ss;
-
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	VERIFY(size != 0);
-	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
-	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
-
-	ssearch.ss_start = start;
-	ssearch.ss_end = start + size;
-	ss = avl_find(&sm->sm_root, &ssearch, wherep);
-
-	if (ss != NULL && ss->ss_start <= start && ss->ss_end >= start + size)
-		return (ss);
-	return (NULL);
-}
-
-boolean_t
-space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	avl_index_t where;
-
-	return (space_map_find(sm, start, size, &where) != 0);
-}
-
-void
-space_map_swap(space_map_t **msrc, space_map_t **mdst)
-{
-	space_map_t *sm;
-
-	ASSERT(MUTEX_HELD((*msrc)->sm_lock));
-	ASSERT0((*mdst)->sm_space);
-	ASSERT0(avl_numnodes(&(*mdst)->sm_root));
-
-	sm = *msrc;
-	*msrc = *mdst;
-	*mdst = sm;
-}
-
-void
-space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
-{
-	space_seg_t *ss;
-	void *cookie = NULL;
-
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-
-	while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
-		if (func != NULL)
-			func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
-		kmem_cache_free(space_seg_cache, ss);
-	}
-	sm->sm_space = 0;
-}
-
-void
-space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
-{
-	space_seg_t *ss;
-
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-
-	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
-		func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
-}
-
 /*
- * Wait for any in-progress space_map_load() to complete.
- */
-void
-space_map_load_wait(space_map_t *sm)
-{
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-
-	while (sm->sm_loading) {
-		ASSERT(!sm->sm_loaded);
-		cv_wait(&sm->sm_load_cv, sm->sm_lock);
-	}
-}
-
-/*
+ * Load the space map from disk into the specified range tree. Segments of maptype
+ * are added to the range tree, other segment types are removed.
+ *
  * Note: space_map_load() will drop sm_lock across dmu_read() calls.
  * The caller must be OK with this.
  */
 int
-space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
-	space_map_obj_t *smo, objset_t *os)
+space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
 {
 	uint64_t *entry, *entry_map, *entry_map_end;
 	uint64_t bufsize, size, offset, end, space;
-	uint64_t mapstart = sm->sm_start;
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(sm->sm_lock));
-	ASSERT(!sm->sm_loaded);
-	ASSERT(!sm->sm_loading);
 
-	sm->sm_loading = B_TRUE;
-	end = smo->smo_objsize;
-	space = smo->smo_alloc;
+	end = space_map_length(sm);
+	space = space_map_allocated(sm);
 
-	ASSERT(sm->sm_ops == NULL);
-	VERIFY0(sm->sm_space);
+	VERIFY0(range_tree_space(rt));
 
 	if (maptype == SM_FREE) {
-		space_map_add(sm, sm->sm_start, sm->sm_size);
+		range_tree_add(rt, sm->sm_start, sm->sm_size);
 		space = sm->sm_size - space;
 	}
 
-	bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT;
+	bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
 	entry_map = zio_buf_alloc(bufsize);
 
 	mutex_exit(sm->sm_lock);
-	if (end > bufsize)
-		dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize);
+	if (end > bufsize) {
+		dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
+		    end - bufsize, ZIO_PRIORITY_SYNC_READ);
+	}
 	mutex_enter(sm->sm_lock);
 
 	for (offset = 0; offset < end; offset += bufsize) {
@@ -347,13 +90,14 @@
 		size = MIN(end - offset, bufsize);
 		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
 		VERIFY(size != 0);
+		ASSERT3U(sm->sm_blksz, !=, 0);
 
 		dprintf("object=%llu  offset=%llx  size=%llx\n",
-		    smo->smo_object, offset, size);
+		    space_map_object(sm), offset, size);
 
 		mutex_exit(sm->sm_lock);
-		error = dmu_read(os, smo->smo_object, offset, size, entry_map,
-		    DMU_READ_PREFETCH);
+		error = dmu_read(sm->sm_os, space_map_object(sm), offset, size,
+		    entry_map, DMU_READ_PREFETCH);
 		mutex_enter(sm->sm_lock);
 		if (error != 0)
 			break;
@@ -361,115 +105,177 @@
 		entry_map_end = entry_map + (size / sizeof (uint64_t));
 		for (entry = entry_map; entry < entry_map_end; entry++) {
 			uint64_t e = *entry;
+			uint64_t offset, size;
 
 			if (SM_DEBUG_DECODE(e))		/* Skip debug entries */
 				continue;
 
-			(SM_TYPE_DECODE(e) == maptype ?
-			    space_map_add : space_map_remove)(sm,
-			    (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
-			    SM_RUN_DECODE(e) << sm->sm_shift);
+			offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
+			    sm->sm_start;
+			size = SM_RUN_DECODE(e) << sm->sm_shift;
+
+			VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift));
+			VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift));
+			VERIFY3U(offset, >=, sm->sm_start);
+			VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size);
+			if (SM_TYPE_DECODE(e) == maptype) {
+				VERIFY3U(range_tree_space(rt) + size, <=,
+				    sm->sm_size);
+				range_tree_add(rt, offset, size);
+			} else {
+				range_tree_remove(rt, offset, size);
+			}
 		}
 	}
 
-	if (error == 0) {
-		VERIFY3U(sm->sm_space, ==, space);
+	if (error == 0)
+		VERIFY3U(range_tree_space(rt), ==, space);
+	else
+		range_tree_vacate(rt, NULL, NULL);
 
-		sm->sm_loaded = B_TRUE;
-		sm->sm_ops = ops;
-		if (ops != NULL)
-			ops->smop_load(sm);
-	} else {
-		space_map_vacate(sm, NULL, NULL);
-	}
-
 	zio_buf_free(entry_map, bufsize);
+	return (error);
+}
 
-	sm->sm_loading = B_FALSE;
+void
+space_map_histogram_clear(space_map_t *sm)
+{
+	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
+		return;
 
-	cv_broadcast(&sm->sm_load_cv);
+	bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
+}
 
-	return (error);
+boolean_t
+space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
+{
+	/*
+	 * Verify that the in-core range tree does not have any
+	 * ranges smaller than our sm_shift size.
+	 */
+	for (int i = 0; i < sm->sm_shift; i++) {
+		if (rt->rt_histogram[i] != 0)
+			return (B_FALSE);
+	}
+	return (B_TRUE);
 }
 
 void
-space_map_unload(space_map_t *sm)
+space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
 {
-	ASSERT(MUTEX_HELD(sm->sm_lock));
+	int idx = 0;
 
-	if (sm->sm_loaded && sm->sm_ops != NULL)
-		sm->sm_ops->smop_unload(sm);
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+	ASSERT(dmu_tx_is_syncing(tx));
+	VERIFY3U(space_map_object(sm), !=, 0);
 
-	sm->sm_loaded = B_FALSE;
-	sm->sm_ops = NULL;
+	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
+		return;
 
-	space_map_vacate(sm, NULL, NULL);
-}
+	dmu_buf_will_dirty(sm->sm_dbuf, tx);
 
-uint64_t
-space_map_maxsize(space_map_t *sm)
-{
-	ASSERT(sm->sm_ops != NULL);
-	return (sm->sm_ops->smop_max(sm));
+	ASSERT(space_map_histogram_verify(sm, rt));
+
+	/*
+	 * Transfer the content of the range tree histogram to the space
+	 * map histogram. The space map histogram contains 32 buckets ranging
+	 * between 2^sm_shift to 2^(32+sm_shift-1). The range tree,
+	 * however, can represent ranges from 2^0 to 2^63. Since the space
+	 * map only cares about allocatable blocks (minimum of sm_shift) we
+	 * can safely ignore all ranges in the range tree smaller than sm_shift.
+	 */
+	for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+
+		/*
+		 * Since the largest histogram bucket in the space map is
+		 * 2^(32+sm_shift-1), we need to normalize the values in
+		 * the range tree for any bucket larger than that size. For
+		 * example given an sm_shift of 9, ranges larger than 2^40
+		 * would get normalized as if they were 1TB ranges. Assume
+		 * the range tree had a count of 5 in the 2^44 (16TB) bucket,
+		 * the calculation below would normalize this to 5 * 2^4 (16).
+		 */
+		ASSERT3U(i, >=, idx + sm->sm_shift);
+		sm->sm_phys->smp_histogram[idx] +=
+		    rt->rt_histogram[i] << (i - idx - sm->sm_shift);
+
+		/*
+		 * Increment the space map's index as long as we haven't
+		 * reached the maximum bucket size. Accumulate all ranges
+		 * larger than the max bucket size into the last bucket.
+		 */
+		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
+			ASSERT3U(idx + sm->sm_shift, ==, i);
+			idx++;
+			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
+		}
+	}
 }
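
Stripped of the on-disk plumbing, the folding loop above reduces to the
following standalone sketch (bucket counts match the comment: 64 range-tree
buckets covering 2^0..2^63, 32 space-map buckets):

    #include <stdint.h>

    #define	RT_HIST_SIZE	64	/* range tree: one bucket per power of two */
    #define	SM_HIST_SIZE	32	/* space map: 2^shift .. 2^(shift + 31) */

    static void
    fold_histogram(const uint64_t rt[RT_HIST_SIZE], uint64_t sm[SM_HIST_SIZE],
        int shift)
    {
    	int idx = 0;

    	for (int i = shift; i < RT_HIST_SIZE; i++) {
    		/*
    		 * Buckets past the last space-map index collapse into
    		 * sm[SM_HIST_SIZE - 1], scaled by 2^(i - idx - shift).
    		 */
    		sm[idx] += rt[i] << (i - idx - shift);
    		if (idx < SM_HIST_SIZE - 1)
    			idx++;
    	}
    }
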
 
 uint64_t
-space_map_alloc(space_map_t *sm, uint64_t size)
+space_map_entries(space_map_t *sm, range_tree_t *rt)
 {
-	uint64_t start;
+	avl_tree_t *t = &rt->rt_root;
+	range_seg_t *rs;
+	uint64_t size, entries;
 
-	start = sm->sm_ops->smop_alloc(sm, size);
-	if (start != -1ULL)
-		space_map_remove(sm, start, size);
-	return (start);
-}
+	/*
+	 * All space_maps always have a debug entry so account for it here.
+	 */
+	entries = 1;
 
-void
-space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	sm->sm_ops->smop_claim(sm, start, size);
-	space_map_remove(sm, start, size);
+	/*
+	 * Traverse the range tree and calculate the number of space map
+	 * entries that would be required to write out the range tree.
+	 */
+	for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
+		size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+		entries += howmany(size, SM_RUN_MAX);
+	}
+	return (entries);
 }
 
-void
-space_map_free(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	space_map_add(sm, start, size);
-	sm->sm_ops->smop_free(sm, start, size);
-}
-
 /*
- * Note: space_map_sync() will drop sm_lock across dmu_write() calls.
+ * Note: space_map_write() will drop sm_lock across dmu_write() calls.
  */
 void
-space_map_sync(space_map_t *sm, uint8_t maptype,
-	space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
+space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+    dmu_tx_t *tx)
 {
+	objset_t *os = sm->sm_os;
 	spa_t *spa = dmu_objset_spa(os);
-	avl_tree_t *t = &sm->sm_root;
-	space_seg_t *ss;
-	uint64_t bufsize, start, size, run_len, total, sm_space, nodes;
+	avl_tree_t *t = &rt->rt_root;
+	range_seg_t *rs;
+	uint64_t size, total, rt_space, nodes;
 	uint64_t *entry, *entry_map, *entry_map_end;
+	uint64_t expected_entries, actual_entries = 1;
 
-	ASSERT(MUTEX_HELD(sm->sm_lock));
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+	VERIFY3U(space_map_object(sm), !=, 0);
+	dmu_buf_will_dirty(sm->sm_dbuf, tx);
 
-	if (sm->sm_space == 0)
+	/*
+	 * This field is no longer necessary since the in-core space map
+	 * now contains the object number but is maintained for backwards
+	 * compatibility.
+	 */
+	sm->sm_phys->smp_object = sm->sm_object;
+
+	if (range_tree_space(rt) == 0) {
+		VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
 		return;
+	}
 
-	dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
-	    smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
-	    maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
-	    sm->sm_space);
-
 	if (maptype == SM_ALLOC)
-		smo->smo_alloc += sm->sm_space;
+		sm->sm_phys->smp_alloc += range_tree_space(rt);
 	else
-		smo->smo_alloc -= sm->sm_space;
+		sm->sm_phys->smp_alloc -= range_tree_space(rt);
 
-	bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
-	bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT);
-	entry_map = zio_buf_alloc(bufsize);
-	entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
+	expected_entries = space_map_entries(sm, rt);
+
+	entry_map = zio_buf_alloc(sm->sm_blksz);
+	entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t));
 	entry = entry_map;
 
 	*entry++ = SM_DEBUG_ENCODE(1) |
@@ -478,24 +284,28 @@
 	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
 
 	total = 0;
-	nodes = avl_numnodes(&sm->sm_root);
-	sm_space = sm->sm_space;
-	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
-		size = ss->ss_end - ss->ss_start;
-		start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
+	nodes = avl_numnodes(&rt->rt_root);
+	rt_space = range_tree_space(rt);
+	for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
+		uint64_t start;
 
-		total += size;
-		size >>= sm->sm_shift;
+		size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+		start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
 
-		while (size) {
+		total += size << sm->sm_shift;
+
+		while (size != 0) {
+			uint64_t run_len;
+
 			run_len = MIN(size, SM_RUN_MAX);
 
 			if (entry == entry_map_end) {
-				mutex_exit(sm->sm_lock);
-				dmu_write(os, smo->smo_object, smo->smo_objsize,
-				    bufsize, entry_map, tx);
-				mutex_enter(sm->sm_lock);
-				smo->smo_objsize += bufsize;
+				mutex_exit(rt->rt_lock);
+				dmu_write(os, space_map_object(sm),
+				    sm->sm_phys->smp_objsize, sm->sm_blksz,
+				    entry_map, tx);
+				mutex_enter(rt->rt_lock);
+				sm->sm_phys->smp_objsize += sm->sm_blksz;
 				entry = entry_map;
 			}
 
@@ -505,162 +315,234 @@
 
 			start += run_len;
 			size -= run_len;
+			actual_entries++;
 		}
 	}
 
 	if (entry != entry_map) {
 		size = (entry - entry_map) * sizeof (uint64_t);
-		mutex_exit(sm->sm_lock);
-		dmu_write(os, smo->smo_object, smo->smo_objsize,
+		mutex_exit(rt->rt_lock);
+		dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize,
 		    size, entry_map, tx);
-		mutex_enter(sm->sm_lock);
-		smo->smo_objsize += size;
+		mutex_enter(rt->rt_lock);
+		sm->sm_phys->smp_objsize += size;
 	}
+	ASSERT3U(expected_entries, ==, actual_entries);
 
 	/*
 	 * Ensure that the space_map's accounting wasn't changed
 	 * while we were in the middle of writing it out.
 	 */
-	VERIFY3U(nodes, ==, avl_numnodes(&sm->sm_root));
-	VERIFY3U(sm->sm_space, ==, sm_space);
-	VERIFY3U(sm->sm_space, ==, total);
+	VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root));
+	VERIFY3U(range_tree_space(rt), ==, rt_space);
+	VERIFY3U(range_tree_space(rt), ==, total);
 
-	zio_buf_free(entry_map, bufsize);
+	zio_buf_free(entry_map, sm->sm_blksz);
 }
 
-void
-space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
+static int
+space_map_open_impl(space_map_t *sm)
 {
-	VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0);
+	int error;
+	u_longlong_t blocks;
 
-	smo->smo_objsize = 0;
-	smo->smo_alloc = 0;
+	error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
+	if (error)
+		return (error);
+
+	dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
+	sm->sm_phys = sm->sm_dbuf->db_data;
+	return (0);
 }
 
-/*
- * Space map reference trees.
- *
- * A space map is a collection of integers.  Every integer is either
- * in the map, or it's not.  A space map reference tree generalizes
- * the idea: it allows its members to have arbitrary reference counts,
- * as opposed to the implicit reference count of 0 or 1 in a space map.
- * This representation comes in handy when computing the union or
- * intersection of multiple space maps.  For example, the union of
- * N space maps is the subset of the reference tree with refcnt >= 1.
- * The intersection of N space maps is the subset with refcnt >= N.
- *
- * [It's very much like a Fourier transform.  Unions and intersections
- * are hard to perform in the 'space map domain', so we convert the maps
- * into the 'reference count domain', where it's trivial, then invert.]
- *
- * vdev_dtl_reassess() uses computations of this form to determine
- * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
- * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
- * has an outage wherever refcnt >= vdev_children.
- */
-static int
-space_map_ref_compare(const void *x1, const void *x2)
+int
+space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
+    uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp)
 {
-	const space_ref_t *sr1 = x1;
-	const space_ref_t *sr2 = x2;
+	space_map_t *sm;
+	int error;
 
-	if (sr1->sr_offset < sr2->sr_offset)
-		return (-1);
-	if (sr1->sr_offset > sr2->sr_offset)
-		return (1);
+	ASSERT(*smp == NULL);
+	ASSERT(os != NULL);
+	ASSERT(object != 0);
 
-	if (sr1 < sr2)
-		return (-1);
-	if (sr1 > sr2)
-		return (1);
+	sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
 
+	sm->sm_start = start;
+	sm->sm_size = size;
+	sm->sm_shift = shift;
+	sm->sm_lock = lp;
+	sm->sm_os = os;
+	sm->sm_object = object;
+
+	error = space_map_open_impl(sm);
+	if (error != 0) {
+		space_map_close(sm);
+		return (error);
+	}
+
+	*smp = sm;
+
 	return (0);
 }
 
 void
-space_map_ref_create(avl_tree_t *t)
+space_map_close(space_map_t *sm)
 {
-	avl_create(t, space_map_ref_compare,
-	    sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
+	if (sm == NULL)
+		return;
+
+	if (sm->sm_dbuf != NULL)
+		dmu_buf_rele(sm->sm_dbuf, sm);
+	sm->sm_dbuf = NULL;
+	sm->sm_phys = NULL;
+
+	kmem_free(sm, sizeof (*sm));
 }
 
 void
-space_map_ref_destroy(avl_tree_t *t)
+space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
 {
-	space_ref_t *sr;
-	void *cookie = NULL;
+	objset_t *os = sm->sm_os;
+	spa_t *spa = dmu_objset_spa(os);
+	dmu_object_info_t doi;
 
-	while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
-		kmem_free(sr, sizeof (*sr));
+	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+	ASSERT(dmu_tx_is_syncing(tx));
 
-	avl_destroy(t);
-}
+	dmu_object_info_from_db(sm->sm_dbuf, &doi);
 
-static void
-space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
-{
-	space_ref_t *sr;
+	/*
+	 * If the space map has the wrong bonus size (because
+	 * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
+	 * the wrong block size (because space_map_blksz has changed),
+	 * free and re-allocate its object with the updated sizes.
+	 *
+	 * Otherwise, just truncate the current object.
+	 */
+	if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
+	    doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
+	    doi.doi_data_block_size != space_map_blksz) {
+		zfs_dbgmsg("txg %llu, spa %s, reallocating: "
+		    "old bonus %u, old blocksz %u", dmu_tx_get_txg(tx),
+		    spa_name(spa), doi.doi_bonus_size, doi.doi_data_block_size);
 
-	sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
-	sr->sr_offset = offset;
-	sr->sr_refcnt = refcnt;
+		space_map_free(sm, tx);
+		dmu_buf_rele(sm->sm_dbuf, sm);
 
-	avl_add(t, sr);
-}
+		sm->sm_object = space_map_alloc(sm->sm_os, tx);
+		VERIFY0(space_map_open_impl(sm));
+	} else {
+		VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
 
-void
-space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
-	int64_t refcnt)
-{
-	space_map_ref_add_node(t, start, refcnt);
-	space_map_ref_add_node(t, end, -refcnt);
+		/*
+		 * If the spacemap is reallocated, its histogram
+		 * will be reset.  Do the same in the common case so that
+		 * bugs related to the uncommon case do not go unnoticed.
+		 */
+		bzero(sm->sm_phys->smp_histogram,
+		    sizeof (sm->sm_phys->smp_histogram));
+	}
+
+	dmu_buf_will_dirty(sm->sm_dbuf, tx);
+	sm->sm_phys->smp_objsize = 0;
+	sm->sm_phys->smp_alloc = 0;
 }
 
 /*
- * Convert (or add) a space map into a reference tree.
+ * Update the in-core space_map allocation and length values.
  */
 void
-space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt)
+space_map_update(space_map_t *sm)
 {
-	space_seg_t *ss;
+	if (sm == NULL)
+		return;
 
 	ASSERT(MUTEX_HELD(sm->sm_lock));
 
-	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
-		space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt);
+	sm->sm_alloc = sm->sm_phys->smp_alloc;
+	sm->sm_length = sm->sm_phys->smp_objsize;
 }
 
-/*
- * Convert a reference tree into a space map.  The space map will contain
- * all members of the reference tree for which refcnt >= minref.
- */
+uint64_t
+space_map_alloc(objset_t *os, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_objset_spa(os);
+	uint64_t object;
+	int bonuslen;
+
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+		spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
+		bonuslen = sizeof (space_map_phys_t);
+		ASSERT3U(bonuslen, <=, dmu_bonus_max());
+	} else {
+		bonuslen = SPACE_MAP_SIZE_V0;
+	}
+
+	object = dmu_object_alloc(os,
+	    DMU_OT_SPACE_MAP, space_map_blksz,
+	    DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
+
+	return (object);
+}
+
 void
-space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref)
+space_map_free(space_map_t *sm, dmu_tx_t *tx)
 {
-	uint64_t start = -1ULL;
-	int64_t refcnt = 0;
-	space_ref_t *sr;
+	spa_t *spa;
 
-	ASSERT(MUTEX_HELD(sm->sm_lock));
+	if (sm == NULL)
+		return;
 
-	space_map_vacate(sm, NULL, NULL);
+	spa = dmu_objset_spa(sm->sm_os);
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+		dmu_object_info_t doi;
 
-	for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
-		refcnt += sr->sr_refcnt;
-		if (refcnt >= minref) {
-			if (start == -1ULL) {
-				start = sr->sr_offset;
-			}
-		} else {
-			if (start != -1ULL) {
-				uint64_t end = sr->sr_offset;
-				ASSERT(start <= end);
-				if (end > start)
-					space_map_add(sm, start, end - start);
-				start = -1ULL;
-			}
+		dmu_object_info_from_db(sm->sm_dbuf, &doi);
+		if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
+			VERIFY(spa_feature_is_active(spa,
+			    SPA_FEATURE_SPACEMAP_HISTOGRAM));
+			spa_feature_decr(spa,
+			    SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
 		}
 	}
-	ASSERT(refcnt == 0);
-	ASSERT(start == -1ULL);
+
+	VERIFY3U(dmu_object_free(sm->sm_os, space_map_object(sm), tx), ==, 0);
+	sm->sm_object = 0;
 }
+
+uint64_t
+space_map_object(space_map_t *sm)
+{
+	return (sm != NULL ? sm->sm_object : 0);
+}
+
+/*
+ * Returns the already synced, on-disk allocated space.
+ */
+uint64_t
+space_map_allocated(space_map_t *sm)
+{
+	return (sm != NULL ? sm->sm_alloc : 0);
+}
+
+/*
+ * Returns the already synced, on-disk length.
+ */
+uint64_t
+space_map_length(space_map_t *sm)
+{
+	return (sm != NULL ? sm->sm_length : 0);
+}
+
+/*
+ * Returns the allocated space that is currently syncing.
+ */
+int64_t
+space_map_alloc_delta(space_map_t *sm)
+{
+	if (sm == NULL)
+		return (0);
+	ASSERT(sm->sm_dbuf != NULL);
+	return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
+}

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
@@ -37,38 +38,104 @@
 #include <sys/dmu.h>
 #include <sys/spa.h>
 
+/*
+ * Used by arc_flush() to inform arc_evict_state() that it should evict
+ * all available buffers from the arc state being passed in.
+ */
+#define	ARC_EVICT_ALL	-1ULL
+
+#define	HDR_SET_LSIZE(hdr, x) do { \
+	ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \
+	(hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \
+_NOTE(CONSTCOND) } while (0)
+
+#define	HDR_SET_PSIZE(hdr, x) do { \
+	ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \
+	(hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \
+_NOTE(CONSTCOND) } while (0)
+
+#define	HDR_GET_LSIZE(hdr)	((hdr)->b_lsize << SPA_MINBLOCKSHIFT)
+#define	HDR_GET_PSIZE(hdr)	((hdr)->b_psize << SPA_MINBLOCKSHIFT)
+
 typedef struct arc_buf_hdr arc_buf_hdr_t;
 typedef struct arc_buf arc_buf_t;
 typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
-typedef int arc_evict_func_t(void *priv);
 
 /* generic arc_done_func_t's which you can use */
 arc_done_func_t arc_bcopy_func;
 arc_done_func_t arc_getbuf_func;
 
+extern int zfs_arc_num_sublists_per_state;
+
+typedef enum arc_flags
+{
+	/*
+	 * Public flags that can be passed into the ARC by external consumers.
+	 */
+	ARC_FLAG_WAIT			= 1 << 0,	/* perform sync I/O */
+	ARC_FLAG_NOWAIT			= 1 << 1,	/* perform async I/O */
+	ARC_FLAG_PREFETCH		= 1 << 2,	/* I/O is a prefetch */
+	ARC_FLAG_CACHED			= 1 << 3,	/* I/O was in cache */
+	ARC_FLAG_L2CACHE		= 1 << 4,	/* cache in L2ARC */
+	ARC_FLAG_PREDICTIVE_PREFETCH	= 1 << 5,	/* I/O from zfetch */
+
+	/*
+	 * Private ARC flags.  These flags are private ARC only flags that
+	 * will show up in b_flags in the arc_buf_hdr_t. These flags should
+	 * only be set by ARC code.
+	 */
+	ARC_FLAG_IN_HASH_TABLE		= 1 << 6,	/* buffer is hashed */
+	ARC_FLAG_IO_IN_PROGRESS		= 1 << 7,	/* I/O in progress */
+	ARC_FLAG_IO_ERROR		= 1 << 8,	/* I/O failed for buf */
+	ARC_FLAG_INDIRECT		= 1 << 9,	/* indirect block */
+	/* Indicates that block was read with ASYNC priority. */
+	ARC_FLAG_PRIO_ASYNC_READ	= 1 << 10,
+	ARC_FLAG_L2_WRITING		= 1 << 11,	/* write in progress */
+	ARC_FLAG_L2_EVICTED		= 1 << 12,	/* evicted during I/O */
+	ARC_FLAG_L2_WRITE_HEAD		= 1 << 13,	/* head of write list */
+	/* indicates that the buffer contains metadata (otherwise, data) */
+	ARC_FLAG_BUFC_METADATA		= 1 << 14,
+
+	/* Flags specifying whether optional hdr struct fields are defined */
+	ARC_FLAG_HAS_L1HDR		= 1 << 15,
+	ARC_FLAG_HAS_L2HDR		= 1 << 16,
+
+	/*
+	 * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data.
+	 * This allows the l2arc to use the blkptr's checksum to verify
+	 * the data without having to store the checksum in the hdr.
+	 */
+	ARC_FLAG_COMPRESSED_ARC		= 1 << 17,
+	ARC_FLAG_SHARED_DATA		= 1 << 18,
+
+	/*
+	 * The arc buffer's compression mode is stored in the top 7 bits of the
+	 * flags field, so these dummy flags are included so that MDB can
+	 * interpret the enum properly.
+	 */
+	ARC_FLAG_COMPRESS_0		= 1 << 24,
+	ARC_FLAG_COMPRESS_1		= 1 << 25,
+	ARC_FLAG_COMPRESS_2		= 1 << 26,
+	ARC_FLAG_COMPRESS_3		= 1 << 27,
+	ARC_FLAG_COMPRESS_4		= 1 << 28,
+	ARC_FLAG_COMPRESS_5		= 1 << 29,
+	ARC_FLAG_COMPRESS_6		= 1 << 30
+
+} arc_flags_t;
+
 struct arc_buf {
 	arc_buf_hdr_t		*b_hdr;
 	arc_buf_t		*b_next;
 	kmutex_t		b_evict_lock;
 	void			*b_data;
-	arc_evict_func_t	*b_efunc;
-	void			*b_private;
 };
 
 typedef enum arc_buf_contents {
+	ARC_BUFC_INVALID,			/* invalid type */
 	ARC_BUFC_DATA,				/* buffer contains data */
 	ARC_BUFC_METADATA,			/* buffer contains metadata */
 	ARC_BUFC_NUMTYPES
 } arc_buf_contents_t;
-/*
- * These are the flags we pass into calls to the arc
- */
-#define	ARC_WAIT	(1 << 1)	/* perform I/O synchronously */
-#define	ARC_NOWAIT	(1 << 2)	/* perform I/O asynchronously */
-#define	ARC_PREFETCH	(1 << 3)	/* I/O is a prefetch */
-#define	ARC_CACHED	(1 << 4)	/* I/O was already in cache */
-#define	ARC_L2CACHE	(1 << 5)	/* cache in L2ARC */
-#define	ARC_L2COMPRESS	(1 << 6)	/* compress in L2ARC */
 
 /*
 * The following breakdowns of arc_size exist for kstat only.
@@ -75,6 +142,7 @@
  */
 typedef enum arc_space_type {
 	ARC_SPACE_DATA,
+	ARC_SPACE_META,
 	ARC_SPACE_HDRS,
 	ARC_SPACE_L2HDRS,
 	ARC_SPACE_OTHER,
@@ -83,42 +151,37 @@
 
 void arc_space_consume(uint64_t space, arc_space_type_t type);
 void arc_space_return(uint64_t space, arc_space_type_t type);
-void *arc_data_buf_alloc(uint64_t space);
-void arc_data_buf_free(void *buf, uint64_t space);
-arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
+arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag,
     arc_buf_contents_t type);
 arc_buf_t *arc_loan_buf(spa_t *spa, int size);
 void arc_return_buf(arc_buf_t *buf, void *tag);
 void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
-void arc_buf_add_ref(arc_buf_t *buf, void *tag);
-boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag);
+void arc_buf_destroy(arc_buf_t *buf, void *tag);
 int arc_buf_size(arc_buf_t *buf);
 void arc_release(arc_buf_t *buf, void *tag);
 int arc_released(arc_buf_t *buf);
-int arc_has_callback(arc_buf_t *buf);
 void arc_buf_freeze(arc_buf_t *buf);
 void arc_buf_thaw(arc_buf_t *buf);
-boolean_t arc_buf_eviction_needed(arc_buf_t *buf);
 #ifdef ZFS_DEBUG
 int arc_referenced(arc_buf_t *buf);
 #endif
 
 int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
-    arc_done_func_t *done, void *priv, int priority, int flags,
-    uint32_t *arc_flags, const zbookmark_t *zb);
+    arc_done_func_t *done, void *priv, zio_priority_t priority, int flags,
+    arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
 zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
-    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
-    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
-    void *priv, int priority, int zio_flags, const zbookmark_t *zb);
+    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+    arc_done_func_t *ready, arc_done_func_t *child_ready,
+    arc_done_func_t *physdone, arc_done_func_t *done,
+    void *priv, zio_priority_t priority, int zio_flags,
+    const zbookmark_phys_t *zb);
 void arc_freed(spa_t *spa, const blkptr_t *bp);
 
-void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *priv);
-int arc_buf_evict(arc_buf_t *buf);
-
-void arc_flush(spa_t *spa);
+void arc_flush(spa_t *spa, boolean_t retry);
 void arc_tempreserve_clear(uint64_t reserve);
 int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
 
+uint64_t arc_max_bytes(void);
 void arc_init(void);
 void arc_fini(void);
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_BPOBJ_H
@@ -77,7 +78,6 @@
 
 int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
 int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
-int bpobj_iterate_dbg(bpobj_t *bpo, uint64_t *itorp, blkptr_t *bp);
 
 void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
 void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -19,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_BPTREE_H
@@ -43,7 +44,7 @@
 typedef struct bptree_entry_phys {
 	blkptr_t be_bp;
 	uint64_t be_birth_txg; /* only delete blocks born after this txg */
-	zbookmark_t be_zb; /* holds traversal resume point if needed */
+	zbookmark_phys_t be_zb; /* holds traversal resume point if needed */
 } bptree_entry_phys_t;
 
 typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
@@ -50,6 +51,7 @@
 
 uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx);
 int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+boolean_t bptree_is_empty(objset_t *os, uint64_t obj);
 
 void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
     uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,8 +21,9 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef	_SYS_DBUF_H
@@ -35,6 +37,7 @@
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
 #include <sys/zrlock.h>
+#include <sys/multilist.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -66,8 +69,13 @@
  *		|			 |
  *		|			 |
  *		+--------> NOFILL -------+
+ *
+ * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
+ * to find all dbufs in a range of a dnode and must be less than any other
+ * dbuf_states_t (see comment on dn_dbufs in dnode.h).
  */
 typedef enum dbuf_states {
+	DB_SEARCH = -1,
 	DB_UNCACHED,
 	DB_FILL,
 	DB_NOFILL,
@@ -112,6 +120,12 @@
 	/* pointer to parent dirty record */
 	struct dbuf_dirty_record *dr_parent;
 
+	/* How much space was charged to dsl_pool_dirty_space() for this? */
+	unsigned int dr_accounted;
+
+	/* A copy of the bp that points to us */
+	blkptr_t dr_bp_copy;
+
 	union dirty_types {
 		struct dirty_indirect {
 
@@ -214,18 +228,37 @@
 	 * Our link on the owner dnodes's dn_dbufs list.
 	 * Protected by its dn_dbufs_mtx.
 	 */
-	list_node_t db_link;
+	avl_node_t db_link;
 
+	/*
+	 * Link in dbuf_cache.
+	 */
+	multilist_node_t db_cache_link;
+
 	/* Data which is unique to data (leaf) blocks: */
 
-	/* stuff we store for the user (see dmu_buf_set_user) */
-	void *db_user_ptr;
-	void **db_user_data_ptr_ptr;
-	dmu_buf_evict_func_t *db_evict_func;
+	/* User callback information. */
+	dmu_buf_user_t *db_user;
 
-	uint8_t db_immediate_evict;
+	/*
+	 * Evict user data as soon as the dirty and reference
+	 * counts are equal.
+	 */
+	uint8_t db_user_immediate_evict;
+
+	/*
+	 * This block was freed while a read or write was
+	 * active.
+	 */
 	uint8_t db_freed_in_flight;
 
+	/*
+	 * dnode_evict_dbufs() or dnode_evict_bonus() tried to
+	 * evict this dbuf, but couldn't due to outstanding
+	 * references.  Evict once the refcount drops to 0.
+	 */
+	uint8_t db_pending_evict;
+
 	uint8_t db_dirtycnt;
 } dmu_buf_impl_t;
 
@@ -238,9 +271,8 @@
 	kmutex_t hash_mutexes[DBUF_MUTEXES];
 } dbuf_hash_table_t;
 
+uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
 
-uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
-
 dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
 void dbuf_create_bonus(struct dnode *dn);
 int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
@@ -251,22 +283,25 @@
 dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
 dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
     void *tag);
-int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
+int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
+    boolean_t fail_sparse, boolean_t fail_uncached,
     void *tag, dmu_buf_impl_t **dbp);
 
-void dbuf_prefetch(struct dnode *dn, uint64_t blkid);
+void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
+    zio_priority_t prio, arc_flags_t aflags);
 
 void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
+boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
+    uint64_t blkid, void *tag);
 uint64_t dbuf_refcount(dmu_buf_impl_t *db);
 
 void dbuf_rele(dmu_buf_impl_t *db, void *tag);
 void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
 
-dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
+dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
+    uint64_t blkid);
 
 int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
-void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
@@ -273,13 +308,15 @@
 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
+void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+    bp_embedded_type_t etype, enum zio_compress comp,
+    int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
 
-void dbuf_clear(dmu_buf_impl_t *db);
-void dbuf_evict(dmu_buf_impl_t *db);
+void dbuf_destroy(dmu_buf_impl_t *db);
 
 void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
-void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
 void dbuf_release_bp(dmu_buf_impl_t *db);
 
 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
@@ -292,20 +329,6 @@
 #define	DB_DNODE_ENTER(_db)	(zrl_add(&DB_DNODE_LOCK(_db)))
 #define	DB_DNODE_EXIT(_db)	(zrl_remove(&DB_DNODE_LOCK(_db)))
 #define	DB_DNODE_HELD(_db)	(!zrl_is_zero(&DB_DNODE_LOCK(_db)))
-#define	DB_GET_SPA(_spa_p, _db) {		\
-	dnode_t *__dn;				\
-	DB_DNODE_ENTER(_db);			\
-	__dn = DB_DNODE(_db);			\
-	*(_spa_p) = __dn->dn_objset->os_spa;	\
-	DB_DNODE_EXIT(_db);			\
-}
-#define	DB_GET_OBJSET(_os_p, _db) {		\
-	dnode_t *__dn;				\
-	DB_DNODE_ENTER(_db);			\
-	__dn = DB_DNODE(_db);			\
-	*(_os_p) = __dn->dn_objset;		\
-	DB_DNODE_EXIT(_db);			\
-}
 
 void dbuf_init(void);
 void dbuf_fini(void);
@@ -325,8 +348,11 @@
 	(dbuf_is_metadata(_db) &&					\
 	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
 
-#define	DBUF_IS_L2COMPRESSIBLE(_db)					\
-	((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF)
+#define	DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level)				\
+	((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||	\
+	(((_level) > 0 ||						\
+	DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) &&	\
+	((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
 
 #ifdef ZFS_DEBUG
 
@@ -354,7 +380,7 @@
 #define	dprintf_dbuf_bp(db, bp, fmt, ...) do {			\
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
-	sprintf_blkptr(__blkbuf, bp);				\
+	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp);		\
 	dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf);	\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
 	}							\

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -172,7 +173,7 @@
 extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type,
     enum ddt_class cls, uint64_t *walk, ddt_entry_t *dde);
 extern int ddt_object_count(ddt_t *ddt, enum ddt_type type,
-    enum ddt_class class, uint64_t *count);
+    enum ddt_class cls, uint64_t *count);
 extern int ddt_object_info(ddt_t *ddt, enum ddt_type type,
     enum ddt_class cls, dmu_object_info_t *);
 extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type,

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -21,9 +22,14 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright 2013 DEY Storage Systems, Inc.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -39,11 +45,10 @@
  * dmu_spa.h.
  */
 
-#include <sys/types.h>
-#include <sys/param.h>
+#include <sys/zfs_context.h>
 #include <sys/cred.h>
-#include <sys/time.h>
 #include <sys/fs/zfs.h>
+#include <sys/zio_priority.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -63,7 +68,7 @@
 struct dnode;
 struct drr_begin;
 struct drr_end;
-struct zbookmark;
+struct zbookmark_phys;
 struct spa;
 struct nvlist;
 struct arc_buf;
@@ -74,6 +79,7 @@
 typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
 typedef struct dsl_dir dsl_dir_t;
+typedef struct dnode dnode_t;
 
 typedef enum dmu_object_byteswap {
 	DMU_BSWAP_UINT8,
@@ -118,6 +124,14 @@
 	((ot) & DMU_OT_METADATA) : \
 	dmu_ot[(ot)].ot_metadata)
 
+/*
+ * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
+ * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
+ * is repurposed for embedded BPs.
+ */
+#define	DMU_OT_HAS_FILL(ot) \
+	((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
+
 #define	DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_BYTESWAP_MASK) : \
 	dmu_ot[(ot)].ot_byteswap)
@@ -217,10 +231,14 @@
 	DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
 } dmu_object_type_t;
 
-typedef enum txg_how {
-	TXG_WAIT = 1,
-	TXG_NOWAIT,
-} txg_how_t;
+/*
+ * These flags are intended to be used to specify the "txg_how"
+ * parameter when calling the dmu_tx_assign() function. See the comment
+ * above dmu_tx_assign() for more details on the meaning of these flags.
+ */
+#define	TXG_NOWAIT	(0ULL)
+#define	TXG_WAIT	(1ULL<<0)
+#define	TXG_NOTHROTTLE	(1ULL<<1)
 
 void byteswap_uint64_array(void *buf, size_t size);
 void byteswap_uint32_array(void *buf, size_t size);
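
Since txg_how is now a bit mask rather than an enum, the classic
assign/retry idiom reads as in the hedged sketch below (a hypothetical
helper; the object number, length, and error handling are illustrative
only):

	static int
	example_write(objset_t *os, uint64_t object)
	{
		dmu_tx_t *tx;
		int err;
	top:
		tx = dmu_tx_create(os);
		dmu_tx_hold_write(tx, object, 0, 4096);
		err = dmu_tx_assign(tx, TXG_NOWAIT);
		if (err == ERESTART) {
			dmu_tx_wait(tx);	/* wait for dirty space */
			dmu_tx_abort(tx);
			goto top;		/* retry with a fresh tx */
		}
		if (err != 0) {
			dmu_tx_abort(tx);	/* e.g. ENOSPC */
			return (err);
		}
		/* ... perform the writes covered by the hold ... */
		dmu_tx_commit(tx);
		return (0);
	}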
@@ -233,17 +251,17 @@
 
 #define	DS_FIND_SNAPSHOTS	(1<<0)
 #define	DS_FIND_CHILDREN	(1<<1)
+#define	DS_FIND_SERIALIZE	(1<<2)
 
 /*
  * The maximum number of bytes that can be accessed as part of one
  * operation, including metadata.
  */
-#define	DMU_MAX_ACCESS (10<<20) /* 10MB */
+#define	DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
 #define	DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
 
 #define	DMU_USERUSED_OBJECT	(-1ULL)
 #define	DMU_GROUPUSED_OBJECT	(-2ULL)
-#define	DMU_DEADLIST_OBJECT	(-3ULL)
 
 /*
  * artificial blkids for bonus buffer and spill blocks
@@ -283,8 +301,6 @@
 	void *db_data;			/* data in buffer */
 } dmu_buf_t;
 
-typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
-
 /*
  * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
  */
@@ -293,6 +309,7 @@
 #define	DMU_POOL_FEATURES_FOR_WRITE	"features_for_write"
 #define	DMU_POOL_FEATURES_FOR_READ	"features_for_read"
 #define	DMU_POOL_FEATURE_DESCRIPTIONS	"feature_descriptions"
+#define	DMU_POOL_FEATURE_ENABLED_TXG	"feature_enabled_txg"
 #define	DMU_POOL_ROOT_DATASET		"root_dataset"
 #define	DMU_POOL_SYNC_BPOBJ		"sync_bplist"
 #define	DMU_POOL_ERRLOG_SCRUB		"errlog_scrub"
@@ -310,6 +327,7 @@
 #define	DMU_POOL_FREE_BPOBJ		"free_bpobj"
 #define	DMU_POOL_BPTREE_OBJ		"bptree_obj"
 #define	DMU_POOL_EMPTY_BPOBJ		"empty_bpobj"
+#define	DMU_POOL_CHECKSUM_SALT		"org.illumos:checksum_salt"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
@@ -331,7 +349,7 @@
 int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
-    int blocksize, dmu_object_type_t bonustype, int bonuslen);
+    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
 
 /*
  * Free an object from this objset.
@@ -393,6 +411,11 @@
 void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx);
 
+void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+    int compressed_size, int byteorder, dmu_tx_t *tx);
+
 /*
  * Decide how to write a block: checksum, compression, number of copies, etc.
  */
@@ -400,7 +423,7 @@
 #define	WP_DMU_SYNC	0x2
 #define	WP_SPILL	0x4
 
-void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
+void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
     struct zio_prop *zp);
 /*
  * The bonus data is accessed more or less like a regular buffer.
@@ -426,7 +449,7 @@
  */
 
 int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
-int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
+int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
     void *tag, dmu_buf_t **dbp);
 int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
 
@@ -446,7 +469,25 @@
  */
 int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **, int flags);
+int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
+    void *tag, dmu_buf_t **dbp, int flags);
+
+/*
+ * Add a reference to a dmu buffer that has already been held via
+ * dmu_buf_hold() in the current context.
+ */
 void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
+
+/*
+ * Attempt to add a reference to a dmu buffer that is in an unknown state,
+ * using a pointer that may have been invalidated by eviction processing.
+ * The request will succeed if the passed in dbuf still represents the
+ * same os/object/blkid, is ineligible for eviction, and has at least
+ * one hold by a user other than the syncer.
+ */
+boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object,
+    uint64_t blkid, void *tag);
+
 void dmu_buf_rele(dmu_buf_t *db, void *tag);
 uint64_t dmu_buf_refcount(dmu_buf_t *db);
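
A hedged sketch of the revalidation pattern dmu_buf_try_add_ref()
enables; os, object, blkid, and offset stand in for whatever identity
the caller cached alongside its dbuf pointer:

	if (!dmu_buf_try_add_ref(db, os, object, blkid, FTAG)) {
		/* Possibly evicted or reused; take a fresh hold. */
		err = dmu_buf_hold(os, object, offset, FTAG, &db, 0);
	}
	/* ... use db, then dmu_buf_rele(db, FTAG) ... */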
 
@@ -461,46 +502,134 @@
  * individually with dmu_buf_rele.
  */
 int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
-    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
+    uint64_t length, boolean_t read, void *tag,
+    int *numbufsp, dmu_buf_t ***dbpp);
 void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
 
+typedef void dmu_buf_evict_func_t(void *user_ptr);
+
 /*
- * Returns NULL on success, or the existing user ptr if it's already
- * been set.
+ * A DMU buffer user object may be associated with a dbuf for the
+ * duration of its lifetime.  This allows the user of a dbuf (client)
+ * to attach private data to a dbuf (e.g. in-core only data such as a
+ * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
+ * when that dbuf has been evicted.  Clients typically respond to the
+ * eviction notification by freeing their private data, thus ensuring
+ * the same lifetime for both dbuf and private data.
  *
- * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
+ * The mapping from a dmu_buf_user_t to any client private data is the
+ * client's responsibility.  All current consumers of the API with private
+ * data embed a dmu_buf_user_t as the first member of the structure for
+ * their private data.  This allows conversions between the two types
+ * with a simple cast.  Since the DMU buf user API never needs access
+ * to the private data, other strategies can be employed if necessary
+ * or convenient for the client (e.g. using container_of() to do the
+ * conversion for private data that cannot have the dmu_buf_user_t as
+ * its first member).
  *
- * user_data_ptr_ptr should be NULL, or a pointer to a pointer which
- * will be set to db->db_data when you are allowed to access it.  Note
- * that db->db_data (the pointer) can change when you do dmu_buf_read(),
- * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
- * *user_data_ptr_ptr will be set to the new value when it changes.
+ * Eviction callbacks are executed without the dbuf mutex held or any
+ * other type of mechanism to guarantee that the dbuf is still available.
+ * For this reason, users must assume the dbuf has already been freed
+ * and not reference the dbuf from the callback context.
  *
- * If non-NULL, pageout func will be called when this buffer is being
- * excised from the cache, so that you can clean up the data structure
- * pointed to by user_ptr.
+ * Users requesting "immediate eviction" are notified as soon as the dbuf
+ * is only referenced by dirty records (dirties == holds).  Otherwise the
+ * notification occurs after eviction processing for the dbuf begins.
+ */
+typedef struct dmu_buf_user {
+	/*
+	 * Asynchronous user eviction callback state.
+	 */
+	taskq_ent_t	dbu_tqent;
+
+	/* This instance's eviction function pointer. */
+	dmu_buf_evict_func_t *dbu_evict_func;
+#ifdef ZFS_DEBUG
+	/*
+	 * Pointer to user's dbuf pointer.  NULL for clients that do
+	 * not associate a dbuf with their user data.
+	 *
+	 * The dbuf pointer is cleared upon eviction so as to catch
+	 * use-after-evict bugs in clients.
+	 */
+	dmu_buf_t **dbu_clear_on_evict_dbufp;
+#endif
+} dmu_buf_user_t;
+
+/*
+ * Initialize the given dmu_buf_user_t instance with the eviction function
+ * evict_func, to be called when the user is evicted.
  *
- * dmu_evict_user() will call the pageout func for all buffers in a
- * objset with a given pageout func.
+ * NOTE: This function should only be called once on a given dmu_buf_user_t.
+ *       To allow enforcement of this, dbu must already be zeroed on entry.
  */
-void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
-    dmu_buf_evict_func_t *pageout_func);
+#ifdef __lint
+/* Very ugly, but it beats issuing suppression directives in many Makefiles. */
+extern void
+dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
+    dmu_buf_t **clear_on_evict_dbufp);
+#else /* __lint */
+static inline void
+dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
+    dmu_buf_t **clear_on_evict_dbufp)
+{
+	ASSERT(dbu->dbu_evict_func == NULL);
+	ASSERT(evict_func != NULL);
+	dbu->dbu_evict_func = evict_func;
+#ifdef ZFS_DEBUG
+	dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
+#endif
+}
+#endif /* __lint */
+
 /*
- * set_user_ie is the same as set_user, but request immediate eviction
- * when hold count goes to zero.
+ * Attach user data to a dbuf and mark it for normal eviction processing
+ * (triggered when the dbuf's data is cleared or its reference count goes
+ * to zero).
+ *
+ * Returns NULL on success, or the existing user if another user currently
+ * owns the buffer.
  */
-void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
-    void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
-void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
-    void *user_ptr, void *user_data_ptr_ptr,
-    dmu_buf_evict_func_t *pageout_func);
-void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
+void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);
 
 /*
- * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
+ * Attach user data to a dbuf and mark it for immediate eviction
+ * processing (triggered as soon as its dirty and reference counts are
+ * equal).
+ *
+ * Returns NULL on success, or the existing user if another user currently
+ * owns the buffer.
  */
+void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);
+
+/*
+ * Replace the current user of a dbuf.
+ *
+ * If given the current user of a dbuf, replaces the dbuf's user with
+ * "new_user" and returns the user data pointer that was replaced.
+ * Otherwise returns the current, and unmodified, dbuf user pointer.
+ */
+void *dmu_buf_replace_user(dmu_buf_t *db,
+    dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);
+
+/*
+ * Remove the specified user data for a DMU buffer.
+ *
+ * Returns the user that was removed on success, or the current user if
+ * another user currently owns the buffer.
+ */
+void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
+
+/*
+ * Returns the user data (dmu_buf_user_t *) associated with this dbuf.
+ */
 void *dmu_buf_get_user(dmu_buf_t *db);
 
+objset_t *dmu_buf_get_objset(dmu_buf_t *db);
+dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
+void dmu_buf_dnode_exit(dmu_buf_t *db);
+
+/* Block until any in-progress dmu buf user evictions complete. */
+void dmu_buf_user_evict_wait(void);
+
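
To make the user API concrete, here is a hedged sketch of a
hypothetical client (my_node_t is invented for illustration) that
embeds dmu_buf_user_t as its first member, registers an eviction
callback, and handles losing the set-user race:

	typedef struct my_node {
		dmu_buf_user_t	mn_dbu;	/* must be first for the cast */
		dmu_buf_t	*mn_db;
		/* ... client-private state ... */
	} my_node_t;

	static void
	my_node_evict(void *dbu)
	{
		my_node_t *mn = dbu;	/* mn_dbu is the first member */

		/* The dbuf may already be freed; do not touch mn->mn_db. */
		kmem_free(mn, sizeof (*mn));
	}

	my_node_t *mn = kmem_zalloc(sizeof (*mn), KM_SLEEP); /* zeroed */
	mn->mn_db = db;
	dmu_buf_init_user(&mn->mn_dbu, my_node_evict, &mn->mn_db);
	if (dmu_buf_set_user(db, &mn->mn_dbu) != NULL) {
		/* Another user won the race; discard ours. */
		kmem_free(mn, sizeof (*mn));
	}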
 /*
  * Returns the blkptr associated with this dbuf, or NULL if not set.
  */
@@ -551,9 +680,10 @@
 void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
 void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
 void dmu_tx_abort(dmu_tx_t *tx);
-int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
 void dmu_tx_wait(dmu_tx_t *tx);
 void dmu_tx_commit(dmu_tx_t *tx);
+void dmu_tx_mark_netfree(dmu_tx_t *tx);
 
 /*
  * To register a commit callback, dmu_tx_callback_register() must be called.
@@ -583,7 +713,7 @@
 	uint64_t size, dmu_tx_t *tx);
 int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
 	uint64_t size);
-int dmu_free_object(objset_t *os, uint64_t object);
+int dmu_free_long_object(objset_t *os, uint64_t object);
 
 /*
  * Convenience functions.
@@ -600,12 +730,20 @@
 void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	dmu_tx_t *tx);
 int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
+int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
 int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
     dmu_tx_t *tx);
 int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
     dmu_tx_t *tx);
+#ifdef _KERNEL
+#ifdef illumos
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, struct page *pp, dmu_tx_t *tx);
+#else
+int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t size, struct vm_page **ppa, dmu_tx_t *tx);
+#endif
+#endif
 struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
 void dmu_return_arcbuf(struct arc_buf *buf);
 void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
@@ -620,13 +758,14 @@
 void xuio_stat_wbuf_copied();
 void xuio_stat_wbuf_nocopy();
 
-extern int zfs_prefetch_disable;
+extern boolean_t zfs_prefetch_disable;
+extern int zfs_max_recordsize;
 
 /*
  * Asynchronously try to read in the data.
  */
-void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
-    uint64_t len);
+void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+    uint64_t len, enum zio_priority pri);
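
The level and priority parameters are new; a level-0 data prefetch of
a byte range might now be requested as in this sketch (the priority
constant is assumed from sys/zio_priority.h):

	dmu_prefetch(os, object, 0, offset, length, ZIO_PRIORITY_ASYNC_READ);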
 
 typedef struct dmu_object_info {
 	/* All sizes are in bytes unless otherwise indicated. */
@@ -638,7 +777,8 @@
 	uint8_t doi_indirection;		/* 2 = dnode->indirect->data */
 	uint8_t doi_checksum;
 	uint8_t doi_compress;
-	uint8_t doi_pad[5];
+	uint8_t doi_nblkptr;
+	uint8_t doi_pad[4];
 	uint64_t doi_physical_blocks_512;	/* data + metadata, 512b blks */
 	uint64_t doi_max_offset;
 	uint64_t doi_fill_count;		/* number of non-empty blocks */
@@ -669,7 +809,7 @@
  */
 int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
 /* Like dmu_object_info, but faster if you have a held dnode in hand. */
-void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
 /* Like dmu_object_info, but faster if you have a held dbuf in hand. */
 void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
 /*
@@ -686,7 +826,7 @@
 	dmu_objset_type_t dds_type;
 	uint8_t dds_is_snapshot;
 	uint8_t dds_inconsistent;
-	char dds_origin[MAXNAMELEN];
+	char dds_origin[ZFS_MAX_DATASET_NAME_LEN];
 } dmu_objset_stats_t;
 
 /*
@@ -736,8 +876,8 @@
 extern void dmu_objset_name(objset_t *os, char *buf);
 extern dmu_objset_type_t dmu_objset_type(objset_t *os);
 extern uint64_t dmu_objset_id(objset_t *os);
-extern uint64_t dmu_objset_syncprop(objset_t *os);
-extern uint64_t dmu_objset_logbias(objset_t *os);
+extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
+extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
 extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
 extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
@@ -790,6 +930,15 @@
     uint64_t *off);
 
 /*
+ * Check if a DMU object has any dirty blocks. If so, sync out
+ * all pending transaction groups. Otherwise, this function
+ * does not alter DMU state. This could be improved to only sync
+ * out the necessary transaction groups for this particular
+ * object.
+ */
+int dmu_object_wait_synced(objset_t *os, uint64_t object);
+
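Usage is a single call; a hedged sketch with placeholder names:

	/* Ensure any dirty blocks of "object" reach stable storage. */
	err = dmu_object_wait_synced(os, object);
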
+/*
  * Initial setup and final teardown.
  */
 extern void dmu_init(void);
@@ -806,6 +955,8 @@
 #define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
 extern uint64_t zfs_crc64_table[256];
 
+extern int zfs_mdcomp_disable;
+
 #ifdef	__cplusplus
 }
 #endif

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -21,8 +22,11 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ */
+/*
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright (c) 2012, Martin Matuska <mm at FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_DMU_IMPL_H
@@ -293,8 +297,19 @@
 	uint64_t dsa_toguid;
 	int dsa_err;
 	dmu_pendop_t dsa_pending_op;
+	uint64_t dsa_featureflags;
+	uint64_t dsa_last_data_object;
+	uint64_t dsa_last_data_offset;
+	uint64_t dsa_resume_object;
+	uint64_t dsa_resume_offset;
+	boolean_t dsa_sent_begin;
+	boolean_t dsa_sent_end;
 } dmu_sendarg_t;
 
+void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
+void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
+int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t,
+    void *, dmu_buf_t **);
 
 #ifdef	__cplusplus
 }

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,8 +21,10 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -74,10 +77,10 @@
 	arc_buf_t *os_phys_buf;
 	objset_phys_t *os_phys;
 	/*
-	 * The following "special" dnodes have no parent and are exempt from
-	 * dnode_move(), but they root their descendents in this objset using
-	 * handles anyway, so that all access to dnodes from dbufs consistently
-	 * uses handles.
+	 * The following "special" dnodes have no parent, are exempt
+	 * from dnode_move(), and are not recorded in os_dnodes, but they
+	 * root their descendants in this objset using handles anyway, so
+	 * that all access to dnodes from dbufs consistently uses handles.
 	 */
 	dnode_handle_t os_meta_dnode;
 	dnode_handle_t os_userused_dnode;
@@ -84,20 +87,29 @@
 	dnode_handle_t os_groupused_dnode;
 	zilog_t *os_zil;
 
+	list_node_t os_evicting_node;
+
 	/* can change, under dsl_dir's locks: */
-	uint8_t os_checksum;
-	uint8_t os_compress;
+	enum zio_checksum os_checksum;
+	enum zio_compress os_compress;
 	uint8_t os_copies;
-	uint8_t os_dedup_checksum;
-	uint8_t os_dedup_verify;
-	uint8_t os_logbias;
-	uint8_t os_primary_cache;
-	uint8_t os_secondary_cache;
-	uint8_t os_sync;
+	enum zio_checksum os_dedup_checksum;
+	boolean_t os_dedup_verify;
+	zfs_logbias_op_t os_logbias;
+	zfs_cache_type_t os_primary_cache;
+	zfs_cache_type_t os_secondary_cache;
+	zfs_sync_type_t os_sync;
+	zfs_redundant_metadata_type_t os_redundant_metadata;
+	int os_recordsize;
 
+	/*
+	 * Pointer is constant; the blkptr it points to is protected by
+	 * os_dsl_dataset->ds_bp_rwlock
+	 */
+	blkptr_t *os_rootbp;
+
 	/* no lock needed: */
 	struct dmu_tx *os_synctx; /* XXX sketchy */
-	blkptr_t *os_rootbp;
 	zil_header_t os_zil_header;
 	list_t os_synced_dnodes;
 	uint64_t os_flags;
@@ -130,12 +142,16 @@
 	((os)->os_secondary_cache == ZFS_CACHE_ALL ||		\
 	(os)->os_secondary_cache == ZFS_CACHE_METADATA)
 
-#define	DMU_OS_IS_L2COMPRESSIBLE(os)	((os)->os_compress != ZIO_COMPRESS_OFF)
+#define	DMU_OS_IS_L2COMPRESSIBLE(os)	(zfs_mdcomp_disable == B_FALSE)
 
 /* called from zpl */
 int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
 int dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp);
+int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj,
+    dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_refresh_ownership(struct dsl_dataset *ds,
+    struct dsl_dataset **newds, void *tag);
 void dmu_objset_rele(objset_t *os, void *tag);
 void dmu_objset_disown(objset_t *os, void *tag);
 int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp);
@@ -167,6 +183,8 @@
 boolean_t dmu_objset_userspace_present(objset_t *os);
 int dmu_fsname(const char *snapname, char *buf);
 
+void dmu_objset_evict_done(objset_t *os);
+
 void dmu_objset_init(void);
 void dmu_objset_fini(void);
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -21,9 +22,10 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef _DMU_SEND_H
@@ -35,8 +37,12 @@
 struct dsl_dataset;
 struct drr_begin;
 struct avl_tree;
+struct dmu_replay_record;
 
-int dmu_send(const char *tosnap, const char *fromsnap, int outfd,
+extern const char *recv_clone_name;
+
+int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+    boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
 #ifdef illumos
     struct vnode *vp, offset_t *off);
 #else
@@ -44,7 +50,10 @@
 #endif
 int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
     uint64_t *sizep);
+int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg,
+    uint64_t *sizep);
 int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
 #ifdef illumos
     int outfd, struct vnode *vp, offset_t *off);
 #else
@@ -53,6 +62,7 @@
 
 typedef struct dmu_recv_cookie {
 	struct dsl_dataset *drc_ds;
+	struct dmu_replay_record *drc_drr_begin;
 	struct drr_begin *drc_drrb;
 	const char *drc_tofs;
 	const char *drc_tosnap;
@@ -59,13 +69,17 @@
 	boolean_t drc_newfs;
 	boolean_t drc_byteswap;
 	boolean_t drc_force;
+	boolean_t drc_resumable;
 	struct avl_tree *drc_guid_to_ds_map;
 	zio_cksum_t drc_cksum;
 	uint64_t drc_newsnapobj;
+	void *drc_owner;
+	cred_t *drc_cred;
 } dmu_recv_cookie_t;
 
-int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
-    boolean_t force, char *origin, dmu_recv_cookie_t *drc);
+int dmu_recv_begin(char *tofs, char *tosnap,
+    struct dmu_replay_record *drr_begin,
+    boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc);
 #ifdef illumos
 int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
 #else
@@ -72,6 +86,7 @@
 int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
 #endif
     int cleanup_fd, uint64_t *action_handlep);
-int dmu_recv_end(dmu_recv_cookie_t *drc);
+int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner);
+boolean_t dmu_objset_is_receiving(objset_t *os);
 
 #endif /* _DMU_SEND_H */

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_DMU_TRAVERSE_H
@@ -40,7 +41,7 @@
 struct arc_buf;
 
 typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg);
+    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg);
 
 #define	TRAVERSE_PRE			(1<<0)
 #define	TRAVERSE_POST			(1<<1)
@@ -54,8 +55,10 @@
 
 int traverse_dataset(struct dsl_dataset *ds,
     uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
+int traverse_dataset_resume(struct dsl_dataset *ds, uint64_t txg_start,
+    zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg);
 int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
-    uint64_t txg_start, zbookmark_t *resume, int flags,
+    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
     blkptr_cb_t func, void *arg);
 int traverse_pool(spa_t *spa,
     uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -23,7 +24,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_DMU_TX_H
@@ -59,8 +60,22 @@
 	txg_handle_t tx_txgh;
 	void *tx_tempreserve_cookie;
 	struct dmu_tx_hold *tx_needassign_txh;
-	list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */
-	uint8_t tx_anyobj;
+
+	/* list of dmu_tx_callback_t on this dmu_tx */
+	list_t tx_callbacks;
+
+	/* placeholder for syncing context, doesn't need specific holds */
+	boolean_t tx_anyobj;
+
+	/* time this transaction was created */
+	hrtime_t tx_start;
+
+	/* need to wait for sufficient dirty space */
+	boolean_t tx_wait_dirty;
+
+	/* has this transaction already been delayed? */
+	boolean_t tx_dirty_delayed;
+
 	int tx_err;
 #ifdef ZFS_DEBUG
 	uint64_t tx_space_towrite;
@@ -87,12 +102,12 @@
 	dmu_tx_t *txh_tx;
 	list_node_t txh_node;
 	struct dnode *txh_dnode;
-	uint64_t txh_space_towrite;
-	uint64_t txh_space_tofree;
-	uint64_t txh_space_tooverwrite;
-	uint64_t txh_space_tounref;
-	uint64_t txh_memory_tohold;
-	uint64_t txh_fudge;
+	refcount_t txh_space_towrite;
+	refcount_t txh_space_tofree;
+	refcount_t txh_space_tooverwrite;
+	refcount_t txh_space_tounref;
+	refcount_t txh_memory_tohold;
+	refcount_t txh_fudge;
 #ifdef ZFS_DEBUG
 	enum dmu_tx_hold_type txh_type;
 	uint64_t txh_arg1;
@@ -110,7 +125,7 @@
  * These routines are defined in dmu.h, and are called by the user.
  */
 dmu_tx_t *dmu_tx_create(objset_t *dd);
-int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
 void dmu_tx_commit(dmu_tx_t *tx);
 void dmu_tx_abort(dmu_tx_t *tx);
 uint64_t dmu_tx_get_txg(dmu_tx_t *tx);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -23,9 +24,13 @@
  * Use is subject to license terms.
  */
 
-#ifndef	_DFETCH_H
-#define	_DFETCH_H
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
 
+#ifndef	_DMU_ZFETCH_H
+#define	_DMU_ZFETCH_H
+
 #include <sys/zfs_context.h>
 
 #ifdef	__cplusplus
@@ -36,29 +41,25 @@
 
 struct dnode;				/* so we can reference dnode */
 
-typedef enum zfetch_dirn {
-	ZFETCH_FORWARD = 1,		/* prefetch increasing block numbers */
-	ZFETCH_BACKWARD	= -1		/* prefetch decreasing block numbers */
-} zfetch_dirn_t;
+typedef struct zstream {
+	uint64_t	zs_blkid;	/* expect next access at this blkid */
+	uint64_t	zs_pf_blkid;	/* next block to prefetch */
 
-typedef struct zstream {
-	uint64_t	zst_offset;	/* offset of starting block in range */
-	uint64_t	zst_len;	/* length of range, in blocks */
-	zfetch_dirn_t	zst_direction;	/* direction of prefetch */
-	uint64_t	zst_stride;	/* length of stride, in blocks */
-	uint64_t	zst_ph_offset;	/* prefetch offset, in blocks */
-	uint64_t	zst_cap;	/* prefetch limit (cap), in blocks */
-	kmutex_t	zst_lock;	/* protects stream */
-	clock_t		zst_last;	/* lbolt of last prefetch */
-	avl_node_t	zst_node;	/* embed avl node here */
+	/*
+	 * We will next prefetch the L1 indirect block of this level-0
+	 * block id.
+	 */
+	uint64_t	zs_ipf_blkid;
+
+	kmutex_t	zs_lock;	/* protects stream */
+	hrtime_t	zs_atime;	/* time last prefetch issued */
+	list_node_t	zs_node;	/* link for zf_stream */
 } zstream_t;
 
 typedef struct zfetch {
 	krwlock_t	zf_rwlock;	/* protects zfetch structure */
-	list_t		zf_stream;	/* AVL tree of zstream_t's */
+	list_t		zf_stream;	/* list of zstream_t's */
 	struct dnode	*zf_dnode;	/* dnode that owns this zfetch */
-	uint32_t	zf_stream_cnt;	/* # of active streams */
-	uint64_t	zf_alloc_fail;	/* # of failed attempts to alloc strm */
 } zfetch_t;
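
The consumer-visible change is the last parameter of dmu_zfetch(),
now a boolean selecting whether the data itself (not only indirect
blocks) is prefetched; a hedged sketch of a call from a read path,
with blkid and nblks as placeholders:

	/* Notify the prefetcher of a sequential read of nblks blocks. */
	dmu_zfetch(&dn->dn_zfetch, blkid, nblks, B_TRUE);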
 
 void		zfetch_init(void);
@@ -65,8 +66,8 @@
 void		zfetch_fini(void);
 
 void		dmu_zfetch_init(zfetch_t *, struct dnode *);
-void		dmu_zfetch_rele(zfetch_t *);
-void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int);
+void		dmu_zfetch_fini(zfetch_t *);
+void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t);
 
 
 #ifdef	__cplusplus
@@ -73,4 +74,4 @@
 }
 #endif
 
-#endif	/* _DFETCH_H */
+#endif	/* _DMU_ZFETCH_H */

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef	_SYS_DNODE_H
@@ -56,8 +58,8 @@
  * Fixed constants.
  */
 #define	DNODE_SHIFT		9	/* 512 bytes */
-#define	DN_MIN_INDBLKSHIFT	10	/* 1k */
-#define	DN_MAX_INDBLKSHIFT	14	/* 16k */
+#define	DN_MIN_INDBLKSHIFT	12	/* 4k */
+#define	DN_MAX_INDBLKSHIFT	17	/* 128k */
 #define	DNODE_BLOCK_SHIFT	14	/* 16k */
 #define	DNODE_CORE_SIZE		64	/* 64 bytes for dnode sans blkptrs */
 #define	DN_MAX_OBJECT_SHIFT	48	/* 256 trillion (zfs_fid_t limit) */
@@ -87,6 +89,11 @@
 
 #define	DNODES_PER_BLOCK_SHIFT	(DNODE_BLOCK_SHIFT - DNODE_SHIFT)
 #define	DNODES_PER_BLOCK	(1ULL << DNODES_PER_BLOCK_SHIFT)
+
+/*
+ * This is inaccurate if the indblkshift of the particular object is not the
+ * max.  But it's only used by userland to calculate the zvol reservation.
+ */
 #define	DNODES_PER_LEVEL_SHIFT	(DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
 #define	DNODES_PER_LEVEL	(1ULL << DNODES_PER_LEVEL_SHIFT)
 
@@ -143,7 +150,7 @@
 	blkptr_t dn_spill;
 } dnode_phys_t;
 
-typedef struct dnode {
+struct dnode {
 	/*
 	 * Protects the structure of the dnode, including the number of levels
 	 * of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
@@ -178,6 +185,7 @@
 	uint16_t dn_datablkszsec;	/* in 512b sectors */
 	uint32_t dn_datablksz;		/* in bytes */
 	uint64_t dn_maxblkid;
+	uint8_t dn_next_type[TXG_SIZE];
 	uint8_t dn_next_nblkptr[TXG_SIZE];
 	uint8_t dn_next_nlevels[TXG_SIZE];
 	uint8_t dn_next_indblkshift[TXG_SIZE];
@@ -188,6 +196,8 @@
 
 	/* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
 	uint32_t dn_dbufs_count;	/* count of dn_dbufs */
+	/* There are no level-0 blocks of this blkid or higher in dn_dbufs */
+	uint64_t dn_unlisted_l0_blkid;
 
 	/* protected by os_lock: */
 	list_node_t dn_dirty_link[TXG_SIZE];	/* next on dataset's dirty */
@@ -195,7 +205,7 @@
 	/* protected by dn_mtx: */
 	kmutex_t dn_mtx;
 	list_t dn_dirty_records[TXG_SIZE];
-	avl_tree_t dn_ranges[TXG_SIZE];
+	struct range_tree *dn_free_ranges[TXG_SIZE];
 	uint64_t dn_allocated_txg;
 	uint64_t dn_free_txg;
 	uint64_t dn_assigned_txg;
@@ -208,7 +218,18 @@
 	refcount_t dn_holds;
 
 	kmutex_t dn_dbufs_mtx;
-	list_t dn_dbufs;		/* descendent dbufs */
+	/*
+	 * Descendant dbufs, ordered by dbuf_compare. Note that dn_dbufs
+	 * can contain multiple dbufs of the same (level, blkid) when a
+	 * dbuf is marked DB_EVICTING without being removed from
+	 * dn_dbufs. To maintain the avl invariant that there cannot be
+	 * duplicate entries, we order the dbufs by an arbitrary value -
+	 * their address in memory. This means that dn_dbufs cannot be used to
+	 * directly look up a dbuf. Instead, callers must use avl_walk, have
+	 * a reference to the dbuf, or look up a non-existant node with
+	 * a reference to the dbuf, or look up a non-existent node with
+	 */
+	avl_tree_t dn_dbufs;
 
 	/* protected by dn_struct_rwlock */
 	struct dmu_buf_impl *dn_bonus;	/* bonus buffer dbuf */
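
A hedged sketch of the DB_SEARCH idiom described above, loosely
paraphrasing dbuf_free_range(); start_blkid and end_blkid are
placeholders:

	dmu_buf_impl_t db_search, *db;
	avl_index_t where;

	db_search.db_level = 0;
	db_search.db_blkid = start_blkid;
	db_search.db_state = DB_SEARCH;

	mutex_enter(&dn->dn_dbufs_mtx);
	/* No exact match is expected; locate the insertion point. */
	db = avl_find(&dn->dn_dbufs, &db_search, &where);
	if (db == NULL)
		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
	for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
		if (db->db_level != 0 || db->db_blkid >= end_blkid)
			break;
		/* ... process this level-0 dbuf ... */
	}
	mutex_exit(&dn->dn_dbufs_mtx);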
@@ -227,7 +248,7 @@
 
 	/* holds prefetch structure */
 	struct zfetch	dn_zfetch;
-} dnode_t;
+};
 
 /*
  * Adds a level of indirection between the dbuf and the dnode to avoid
@@ -241,8 +262,9 @@
 } dnode_handle_t;
 
 typedef struct dnode_children {
+	dmu_buf_user_t dnc_dbu;		/* User evict data */
 	size_t dnc_count;		/* number of children */
-	dnode_handle_t dnc_children[1];	/* sized dynamically */
+	dnode_handle_t dnc_children[];	/* sized dynamically */
 } dnode_children_t;
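
With the C99 flexible array member, the handle array is allocated
inline with the header; a hedged allocation sketch, where count is
illustrative:

	dnode_children_t *dnc = kmem_zalloc(sizeof (dnode_children_t) +
	    count * sizeof (dnode_handle_t), KM_SLEEP);
	dnc->dnc_count = count;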
 
 typedef struct free_range {
@@ -251,7 +273,7 @@
 	uint64_t fr_nblks;
 } free_range_t;
 
-dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
+void dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
     uint64_t object, dnode_handle_t *dnh);
 void dnode_special_close(dnode_handle_t *dnh);
 
@@ -265,6 +287,7 @@
     void *ref, dnode_t **dnp);
 boolean_t dnode_add_ref(dnode_t *dn, void *ref);
 void dnode_rele(dnode_t *dn, void *ref);
+void dnode_rele_and_unlock(dnode_t *dn, void *tag);
 void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
 void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
 void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
@@ -277,8 +300,6 @@
 void dnode_verify(dnode_t *dn);
 int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
 void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
-void dnode_clear_range(dnode_t *dn, uint64_t blkid,
-    uint64_t nblks, dmu_tx_t *tx);
 void dnode_diduse_space(dnode_t *dn, int64_t space);
 void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
 void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
@@ -288,7 +309,17 @@
 int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
     int minlvl, uint64_t blkfill, uint64_t txg);
 void dnode_evict_dbufs(dnode_t *dn);
+void dnode_evict_bonus(dnode_t *dn);
 
+#define	DNODE_IS_CACHEABLE(_dn)						\
+	((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
+	(DMU_OT_IS_METADATA((_dn)->dn_type) &&				\
+	(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))
+
+#define	DNODE_META_IS_CACHEABLE(_dn)					\
+	((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
+	(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
+
 #ifdef ZFS_DEBUG
 
 /*

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,9 +21,11 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef	_SYS_DSL_DATASET_H
@@ -37,6 +40,8 @@
 #include <sys/zfs_context.h>
 #include <sys/dsl_deadlist.h>
 #include <sys/refcount.h>
+#include <sys/rrwlock.h>
+#include <zfeature_common.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -48,10 +53,10 @@
 
 #define	DS_FLAG_INCONSISTENT	(1ULL<<0)
 #define	DS_IS_INCONSISTENT(ds)	\
-	((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT)
+	(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT)
+
 /*
- * Note: nopromote can not yet be set, but we want support for it in this
- * on-disk version, so that we don't need to upgrade for it later.
+ * Do not allow this dataset to be promoted.
  */
 #define	DS_FLAG_NOPROMOTE	(1ULL<<1)
 
@@ -68,9 +73,33 @@
  */
 #define	DS_FLAG_DEFER_DESTROY	(1ULL<<3)
 #define	DS_IS_DEFER_DESTROY(ds)	\
-	((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY)
+	(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY)
 
 /*
+ * DS_FIELD_* are strings that are used in the "extensified" dataset zap object.
+ * They should be of the format <reverse-dns>:<field>.
+ */
+
+/*
+ * This field's value is the object ID of a zap object which contains the
+ * bookmarks of this dataset.  If it is present, then this dataset is counted
+ * in the refcount of the SPA_FEATURES_BOOKMARKS feature.
+ */
+#define	DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
+
+/*
+ * These fields are set on datasets that are in the middle of a resumable
+ * receive, and allow the sender to resume the send if it is interrupted.
+ */
+#define	DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid"
+#define	DS_FIELD_RESUME_TONAME "com.delphix:resume_toname"
+#define	DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid"
+#define	DS_FIELD_RESUME_OBJECT "com.delphix:resume_object"
+#define	DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset"
+#define	DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes"
+#define	DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok"
+
+/*
  * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
  * name lookups should be performed case-insensitively.
  */
@@ -113,15 +142,19 @@
 } dsl_dataset_phys_t;
 
 typedef struct dsl_dataset {
+	dmu_buf_user_t ds_dbu;
+	rrwlock_t ds_bp_rwlock; /* Protects ds_phys->ds_bp */
+
 	/* Immutable: */
 	struct dsl_dir *ds_dir;
-	dsl_dataset_phys_t *ds_phys;
 	dmu_buf_t *ds_dbuf;
 	uint64_t ds_object;
 	uint64_t ds_fsid_guid;
+	boolean_t ds_is_snapshot;
 
 	/* only used in syncing context, only valid for non-snapshots: */
 	struct dsl_dataset *ds_prev;
+	uint64_t ds_bookmarks;  /* DMU_OTN_ZAP_METADATA */
 
 	/* has internal locking: */
 	dsl_deadlist_t ds_deadlist;
@@ -160,10 +193,39 @@
 	kmutex_t ds_sendstream_lock;
 	list_t ds_sendstreams;
 
+	/*
+	 * When in the middle of a resumable receive, tracks how much
+	 * progress we have made.
+	 */
+	uint64_t ds_resume_object[TXG_SIZE];
+	uint64_t ds_resume_offset[TXG_SIZE];
+	uint64_t ds_resume_bytes[TXG_SIZE];
+
+	/* Protected by our dsl_dir's dd_lock */
+	list_t ds_prop_cbs;
+
+	/*
+	 * For ZFEATURE_FLAG_PER_DATASET features, set if this dataset
+	 * uses this feature.
+	 */
+	uint8_t ds_feature_inuse[SPA_FEATURES];
+
+	/*
+	 * Set if we need to activate the feature on this dataset this txg
+	 * (used only in syncing context).
+	 */
+	uint8_t ds_feature_activation_needed[SPA_FEATURES];
+
 	/* Protected by ds_lock; keep at end of struct for better locality */
-	char ds_snapname[MAXNAMELEN];
+	char ds_snapname[ZFS_MAX_DATASET_NAME_LEN];
 } dsl_dataset_t;
 
+static inline dsl_dataset_phys_t *
+dsl_dataset_phys(dsl_dataset_t *ds)
+{
+	return (ds->ds_dbuf->db_data);
+}
+
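Call sites change mechanically from the removed ds_phys pointer to the
accessor; a hedged before/after sketch using a real phys field:

	/* before: uint64_t txg = ds->ds_phys->ds_creation_txg; */
	uint64_t txg = dsl_dataset_phys(ds)->ds_creation_txg;
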
 /*
  * The max length of a temporary tag prefix is the number of hex digits
  * required to express UINT64_MAX plus one for the hyphen.
@@ -171,13 +233,15 @@
 #define	MAX_TAG_PREFIX_LEN	17
 
 #define	dsl_dataset_is_snapshot(ds) \
-	((ds)->ds_phys->ds_num_children != 0)
+	(dsl_dataset_phys(ds)->ds_num_children != 0)
 
 #define	DS_UNIQUE_IS_ACCURATE(ds)	\
-	(((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
+	((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
 
 int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag,
     dsl_dataset_t **dsp);
+boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds,
+    void *tag);
 int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag,
     dsl_dataset_t **);
 void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
@@ -188,6 +252,8 @@
 void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
 void dsl_dataset_name(dsl_dataset_t *ds, char *name);
 boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag);
+int dsl_dataset_namelen(dsl_dataset_t *ds);
+boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds);
 uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
 uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
@@ -202,13 +268,14 @@
     minor_t cleanup_minor, const char *htag);
 
 blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
-void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
 
 spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
 
-boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds);
+boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds,
+    dsl_dataset_t *snap);
 
 void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
+void dsl_dataset_sync_done(dsl_dataset_t *os, dmu_tx_t *tx);
 
 void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
     dmu_tx_t *tx);
@@ -241,17 +308,18 @@
 int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
     uint64_t reservation);
 
-boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier);
+boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
+    uint64_t earlier_txg);
 void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag);
 void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag);
 boolean_t dsl_dataset_long_held(dsl_dataset_t *ds);
 
 int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
-    dsl_dataset_t *origin_head, boolean_t force);
+    dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx);
 void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
     dsl_dataset_t *origin_head, dmu_tx_t *tx);
 int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
-    dmu_tx_t *tx);
+    dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr);
 void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
     dmu_tx_t *tx);
 
@@ -261,18 +329,25 @@
 int dsl_dataset_get_snapname(dsl_dataset_t *ds);
 int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name,
     uint64_t *value);
-int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx);
+int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
+    boolean_t adj_cnt);
 void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
     zprop_source_t source, uint64_t value, dmu_tx_t *tx);
-int dsl_dataset_rollback(const char *fsname);
+void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx);
+boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds);
+boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds);
+int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result);
 
+void dsl_dataset_deactivate_feature(uint64_t dsobj,
+    spa_feature_t f, dmu_tx_t *tx);
+
 #ifdef ZFS_DEBUG
 #define	dprintf_ds(ds, fmt, ...) do { \
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
-	char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
+	char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \
 	dsl_dataset_name(ds, __ds_name); \
 	dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
-	kmem_free(__ds_name, MAXNAMELEN); \
+	kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \
 	} \
 _NOTE(CONSTCOND) } while (0)
 #else

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  */
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_DSL_DELEG_H
@@ -56,6 +57,7 @@
 #define	ZFS_DELEG_PERM_HOLD		"hold"
 #define	ZFS_DELEG_PERM_RELEASE		"release"
 #define	ZFS_DELEG_PERM_DIFF		"diff"
+#define	ZFS_DELEG_PERM_BOOKMARK		"bookmark"
 
 /*
  * Note: the names of properties that are marked delegatable are also

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  */
 
@@ -35,15 +36,16 @@
 struct dsl_dataset;
 struct dmu_tx;
 
-int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
-    struct nvlist *errlist);
-int dsl_destroy_snapshot(const char *name, boolean_t defer);
-int dsl_destroy_head(const char *name);
-int dsl_destroy_head_check_impl(struct dsl_dataset *ds, int expected_holds);
-void dsl_destroy_head_sync_impl(struct dsl_dataset *ds, struct dmu_tx *tx);
-int dsl_destroy_inconsistent(const char *dsname, void *arg);
-void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *ds,
-    boolean_t defer, struct dmu_tx *tx);
+int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t,
+    struct nvlist *);
+int dsl_destroy_snapshot(const char *, boolean_t);
+int dsl_destroy_head(const char *);
+int dsl_destroy_head_check_impl(struct dsl_dataset *, int);
+void dsl_destroy_head_sync_impl(struct dsl_dataset *, struct dmu_tx *);
+int dsl_destroy_inconsistent(const char *, void *);
+int dsl_destroy_snapshot_check_impl(struct dsl_dataset *, boolean_t);
+void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *,
+    boolean_t, struct dmu_tx *);
 
 #ifdef	__cplusplus
 }

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,9 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef	_SYS_DSL_DIR_H
@@ -38,6 +41,14 @@
 
 struct dsl_dataset;
 
+/*
+ * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
+ * They should be of the format <reverse-dns>:<field>.
+ */
+
+#define	DD_FIELD_FILESYSTEM_COUNT	"com.joyent:filesystem_count"
+#define	DD_FIELD_SNAPSHOT_COUNT		"com.joyent:snapshot_count"
+
 typedef enum dd_used {
 	DD_USED_HEAD,
 	DD_USED_SNAP,
@@ -75,12 +86,15 @@
 } dsl_dir_phys_t;
 
 struct dsl_dir {
+	dmu_buf_user_t dd_dbu;
+
 	/* These are immutable; no lock needed: */
 	uint64_t dd_object;
-	dsl_dir_phys_t *dd_phys;
-	dmu_buf_t *dd_dbuf;
 	dsl_pool_t *dd_pool;
 
+	/* Stable until user eviction; no lock needed: */
+	dmu_buf_t *dd_dbuf;
+
 	/* protected by lock on pool's dp_dirty_dirs list */
 	txg_node_t dd_dirty_link;
 
@@ -89,7 +103,7 @@
 
 	/* Protected by dd_lock */
 	kmutex_t dd_lock;
-	list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
+	list_t dd_props; /* list of dsl_prop_record_t's */
 	timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
 	uint64_t dd_origin_txg;
 
@@ -99,10 +113,17 @@
 	int64_t dd_space_towrite[TXG_SIZE];
 
 	/* protected by dd_lock; keep at end of struct for better locality */
-	char dd_myname[MAXNAMELEN];
+	char dd_myname[ZFS_MAX_DATASET_NAME_LEN];
 };
 
+static inline dsl_dir_phys_t *
+dsl_dir_phys(dsl_dir_t *dd)
+{
+	return (dd->dd_dbuf->db_data);
+}
+
 void dsl_dir_rele(dsl_dir_t *dd, void *tag);
+void dsl_dir_async_rele(dsl_dir_t *dd, void *tag);
 int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
     dsl_dir_t **, const char **tail);
 int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
@@ -129,8 +150,13 @@
     uint64_t quota);
 int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
     uint64_t reservation);
+int dsl_dir_activate_fs_ss_limit(const char *);
+int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *,
+    cred_t *);
+void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *);
 int dsl_dir_rename(const char *oldname, const char *newname);
-int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
+int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
+    uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *);
 boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
 void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
     uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
@@ -138,6 +164,8 @@
 timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
 void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
     dmu_tx_t *tx);
+void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
+boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);
 
 /* internal reserved dir name */
 #define	MOS_DIR_NAME "$MOS"
@@ -144,15 +172,15 @@
 #define	ORIGIN_DIR_NAME "$ORIGIN"
 #define	XLATION_DIR_NAME "$XLATION"
 #define	FREE_DIR_NAME "$FREE"
+#define	LEAK_DIR_NAME "$LEAK"
 
 #ifdef ZFS_DEBUG
 #define	dprintf_dd(dd, fmt, ...) do { \
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
-	char *__ds_name = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, \
-	    KM_SLEEP); \
+	char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \
 	dsl_dir_name(dd, __ds_name); \
 	dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \
-	kmem_free(__ds_name, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); \
+	kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \
 	} \
 _NOTE(CONSTCOND) } while (0)
 #else

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_DSL_POOL_H
@@ -49,6 +50,13 @@
 struct dmu_tx;
 struct dsl_scan;
 
+extern uint64_t zfs_dirty_data_max;
+extern uint64_t zfs_dirty_data_max_max;
+extern uint64_t zfs_dirty_data_sync;
+extern int zfs_dirty_data_max_percent;
+extern int zfs_delay_min_dirty_percent;
+extern uint64_t zfs_delay_scale;
+
 /* These macros are for indexing into the zfs_all_blkstats_t. */
 #define	DMU_OT_DEFERRED	DMU_OT_NONE
 #define	DMU_OT_OTHER	DMU_OT_NUMTYPES /* place holder for DMU_OT() types */
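
These tunables parameterize the new dirty-data write throttle: once
the pool's dirty total passes zfs_delay_min_dirty_percent of
zfs_dirty_data_max, transaction assignment is delayed. A hedged sketch
of the curve documented in dmu_tx.c (not a verbatim copy; dirty is a
placeholder for the current dirty byte count):

	uint64_t min_bytes = zfs_dirty_data_max *
	    zfs_delay_min_dirty_percent / 100;

	if (dirty > min_bytes) {
		/* Delay grows smoothly, diverging as dirty -> max. */
		hrtime_t delay = zfs_delay_scale *
		    (dirty - min_bytes) / (zfs_dirty_data_max - dirty);
		/* ... clamp delay and sleep before assigning the tx ... */
	}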
@@ -77,6 +85,7 @@
 	struct dsl_dir *dp_root_dir;
 	struct dsl_dir *dp_mos_dir;
 	struct dsl_dir *dp_free_dir;
+	struct dsl_dir *dp_leak_dir;
 	struct dsl_dataset *dp_origin_snap;
 	uint64_t dp_root_dir_obj;
 	struct taskq *dp_vnrele_taskq;
@@ -83,9 +92,6 @@
 
 	/* No lock needed - sync context only */
 	blkptr_t dp_meta_rootbp;
-	hrtime_t dp_read_overhead;
-	uint64_t dp_throughput; /* bytes per millisec */
-	uint64_t dp_write_limit;
 	uint64_t dp_tmp_userrefs_obj;
 	bpobj_t dp_free_bpobj;
 	uint64_t dp_bptree_obj;
@@ -95,12 +101,19 @@
 
 	/* Uses dp_lock */
 	kmutex_t dp_lock;
-	uint64_t dp_space_towrite[TXG_SIZE];
-	uint64_t dp_tempreserved[TXG_SIZE];
+	kcondvar_t dp_spaceavail_cv;
+	uint64_t dp_dirty_pertxg[TXG_SIZE];
+	uint64_t dp_dirty_total;
 	uint64_t dp_mos_used_delta;
 	uint64_t dp_mos_compressed_delta;
 	uint64_t dp_mos_uncompressed_delta;
 
+	/*
+	 * Time of most recently scheduled (furthest in the future)
+	 * wakeup for delayed transactions.
+	 */
+	hrtime_t dp_last_wakeup;
+
 	/* Has its own locking */
 	tx_state_t dp_tx;
 	txg_list_t dp_dirty_datasets;
@@ -129,10 +142,8 @@
 int dsl_pool_sync_context(dsl_pool_t *dp);
 uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
 uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
-int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx);
-void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
-void dsl_pool_memory_pressure(dsl_pool_t *dp);
-void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
+void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
+void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
 void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
 void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
     const blkptr_t *bpp);
@@ -142,8 +153,11 @@
 void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
     int64_t used, int64_t comp, int64_t uncomp);
 void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
+void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag);
 void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);
 boolean_t dsl_pool_config_held(dsl_pool_t *dp);
+boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);
+boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);
 
 taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
 

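The externs above are the tunables of the dirty-data throttle that replaces the old dp_write_limit machinery. Below is a simplified user-space model of the delay trigger; the 4 GiB cap and the 60 percent threshold are assumed defaults, and the lock-protected dsl_pool_need_dirty_delay() logic is reduced to a single comparison.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t zfs_dirty_data_max = 4ULL << 30;	/* assumed: 4 GiB cap */
static int zfs_delay_min_dirty_percent = 60;		/* assumed default */

/* Roughly what dsl_pool_need_dirty_delay() decides, minus the locking. */
static bool
need_dirty_delay(uint64_t dp_dirty_total)
{
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;

	return (dp_dirty_total > delay_min_bytes);
}

int
main(void)
{
	printf("%d\n", need_dirty_delay(3ULL << 30));	/* 1: 75% dirty */
	printf("%d\n", need_dirty_delay(1ULL << 30));	/* 0: 25% dirty */
	return (0);
}
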
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -41,10 +42,17 @@
 /* The callback func may not call into the DMU or DSL! */
 typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval);
 
+typedef struct dsl_prop_record {
+	list_node_t pr_node; /* link on dd_props */
+	const char *pr_propname;
+	list_t pr_cbs;
+} dsl_prop_record_t;
+
 typedef struct dsl_prop_cb_record {
-	list_node_t cbr_node; /* link on dd_prop_cbs */
+	list_node_t cbr_pr_node; /* link on pr_cbs */
+	list_node_t cbr_ds_node; /* link on ds_prop_cbs */
+	dsl_prop_record_t *cbr_pr;
 	struct dsl_dataset *cbr_ds;
-	const char *cbr_propname;
 	dsl_prop_changed_cb_t *cbr_func;
 	void *cbr_arg;
 } dsl_prop_cb_record_t;
@@ -54,10 +62,11 @@
 	zprop_source_t pa_source;
 } dsl_props_arg_t;
 
+void dsl_prop_init(dsl_dir_t *dd);
+void dsl_prop_fini(dsl_dir_t *dd);
 int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
     dsl_prop_changed_cb_t *callback, void *cbarg);
-int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
-    dsl_prop_changed_cb_t *callback, void *cbarg);
+void dsl_prop_unregister_all(struct dsl_dataset *ds, void *cbarg);
 void dsl_prop_notify_all(struct dsl_dir *dd);
 boolean_t dsl_prop_hascb(struct dsl_dataset *ds);
 

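The new dsl_prop_record_t layout keys callbacks by property name: one record per property, each carrying its own callback list, instead of the old single flat list on the dsl_dir. A toy sketch of the lookup-then-append shape, with fixed arrays in place of list_t and all names hypothetical:

#include <stdio.h>
#include <string.h>

#define	MAX_RECORDS	8
#define	MAX_CBS		8

typedef void (prop_cb_t)(void *arg, unsigned long long newval);

struct prop_record {			/* stands in for dsl_prop_record_t */
	const char *pr_propname;
	prop_cb_t *pr_cbs[MAX_CBS];
	int pr_ncbs;
};

static struct prop_record records[MAX_RECORDS];
static int nrecords;

/* Find (or create) the per-property record, then hang the callback on it. */
static void
prop_register(const char *propname, prop_cb_t *cb)
{
	for (int i = 0; i < nrecords; i++) {
		if (strcmp(records[i].pr_propname, propname) == 0) {
			records[i].pr_cbs[records[i].pr_ncbs++] = cb;
			return;
		}
	}
	if (nrecords == MAX_RECORDS)
		return;
	records[nrecords].pr_propname = propname;
	records[nrecords].pr_cbs[0] = cb;
	records[nrecords].pr_ncbs = 1;
	nrecords++;
}

static void
on_compression_changed(void *arg, unsigned long long newval)
{
	(void) arg;
	printf("compression is now %llu\n", newval);
}

int
main(void)
{
	prop_register("compression", on_compression_changed);
	/* Notifying a property now walks only that property's list. */
	for (int i = 0; i < records[0].pr_ncbs; i++)
		records[0].pr_cbs[i](NULL, 15);
	return (0);
}
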
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_DSL_SCAN_H
@@ -62,7 +63,7 @@
 	uint64_t scn_errors;	/* scan I/O error count */
 	uint64_t scn_ddt_class_max;
 	ddt_bookmark_t scn_ddt_bookmark;
-	zbookmark_t scn_bookmark;
+	zbookmark_phys_t scn_bookmark;
 	uint64_t scn_flags; /* dsl_scan_flags_t */
 } dsl_scan_phys_t;
 
@@ -72,11 +73,42 @@
 	DSF_VISIT_DS_AGAIN = 1<<0,
 } dsl_scan_flags_t;
 
+/*
+ * Every pool will have one dsl_scan_t and this structure will contain
+ * in-memory information about the scan and a pointer to the on-disk
+ * representation (i.e. dsl_scan_phys_t). Most of the state of the scan
+ * is contained on-disk to allow the scan to resume in the event of a reboot
+ * or panic. This structure maintains information about the behavior of a
+ * running scan, some caching information, and how it should traverse the pool.
+ *
+ * The following members of this structure direct the behavior of the scan:
+ *
+ * scn_pausing -	a scan that cannot be completed in a single txg or
+ *			has exceeded its allotted time will need to pause.
+ *			When this flag is set the scanner will stop traversing
+ *			the pool and write out the current state to disk.
+ *
+ * scn_restart_txg -	directs the scanner to either restart or start a
+ *			scan at the specified txg value.
+ *
+ * scn_done_txg -	when a scan completes its traversal it will set
+ *			the completion txg to the next txg. This is necessary
+ *			to ensure that any blocks that were freed during
+ *			the scan but have not yet been processed (i.e. deferred
+ *			frees) are accounted for.
+ *
+ * This structure also maintains information about deferred frees which are
+ * a special kind of traversal. Deferred frees can exist in either a bptree or
+ * a bpobj structure. The scn_is_bptree flag will indicate the type of
+ * deferred free that is in progress. If the deferred free is part of an
+ * asynchronous destroy then the scn_async_destroying flag will be set.
+ */
 typedef struct dsl_scan {
 	struct dsl_pool *scn_dp;
 
 	boolean_t scn_pausing;
 	uint64_t scn_restart_txg;
+	uint64_t scn_done_txg;
 	uint64_t scn_sync_start_time;
 	zio_t *scn_zio_root;
 
@@ -83,6 +115,7 @@
 	/* for freeing blocks */
 	boolean_t scn_is_bptree;
 	boolean_t scn_async_destroying;
+	boolean_t scn_async_stalled;
 
 	/* for debugging / information */
 	uint64_t scn_visited_this_txg;

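A minimal model of the pause/resume behavior the new comment describes: when the per-txg budget is exhausted the scan sets its pausing flag and later resumes from the persisted bookmark. The integer bookmark and object budget are invented for illustration; the real scanner checkpoints a zbookmark_phys_t and budgets time, not objects.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct scan_state {
	uint64_t bookmark;	/* last object visited (cf. scn_bookmark) */
	bool pausing;		/* cf. scn_pausing */
};

#define	SCAN_BUDGET	100	/* objects per txg; stands in for a time cap */
#define	POOL_OBJECTS	250

/* Returns true once the whole pool has been visited. */
static bool
scan_one_txg(struct scan_state *scn)
{
	uint64_t visited = 0;

	scn->pausing = false;
	while (scn->bookmark < POOL_OBJECTS) {
		scn->bookmark++;		/* "visit" one object */
		if (++visited >= SCAN_BUDGET) {
			scn->pausing = true;	/* budget gone: checkpoint */
			return (false);
		}
	}
	return (true);
}

int
main(void)
{
	struct scan_state scn = { 0, false };
	int txg = 50;

	while (!scan_one_txg(&scn))
		printf("txg %d: paused at bookmark %llu\n",
		    txg++, (unsigned long long)scn.bookmark);
	printf("txg %d: scan complete\n", txg);
	return (0);
}
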
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_DSL_SYNCTASK_H
@@ -38,11 +39,41 @@
 typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *);
 typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *);
 
+typedef enum zfs_space_check {
+	/*
+	 * Normal space check: if there is less than 3.2% free space,
+	 * the operation will fail.  Operations which are logically
+	 * creating things should use this (e.g. "zfs create", "zfs snapshot").
+	 * User writes (via the ZPL / ZVOL) also fail at this point.
+	 */
+	ZFS_SPACE_CHECK_NORMAL,
+
+	/*
+	 * Space check allows use of half the slop space.  If there
+	 * is less than 1.6% free space, the operation will fail.  Most
+	 * operations should use this (e.g. "zfs set", "zfs rename"),
+	 * because we want them to succeed even after user writes are failing,
+	 * so that they can be used as part of the space recovery process.
+	 */
+	ZFS_SPACE_CHECK_RESERVED,
+
+	/*
+	 * No space check is performed.  Only operations which we expect to
+	 * result in a net reduction in space should use this
+	 * (e.g. "zfs destroy").  Setting quotas & reservations also uses
+	 * this because it needs to circumvent the quota/reservation checks.
+	 *
+	 * See also the comments above spa_slop_shift.
+	 */
+	ZFS_SPACE_CHECK_NONE,
+} zfs_space_check_t;
+
 typedef struct dsl_sync_task {
 	txg_node_t dst_node;
 	struct dsl_pool *dst_pool;
 	uint64_t dst_txg;
 	int dst_space;
+	zfs_space_check_t dst_space_check;
 	dsl_checkfunc_t *dst_checkfunc;
 	dsl_syncfunc_t *dst_syncfunc;
 	void *dst_arg;
@@ -50,11 +81,11 @@
 	boolean_t dst_nowaiter;
 } dsl_sync_task_t;
 
-void dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx);
-int dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
-    dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified);
-void dsl_sync_task_nowait(struct dsl_pool *dp, dsl_syncfunc_t *syncfunc,
-    void *arg, int blocks_modified, dmu_tx_t *tx);
+void dsl_sync_task_sync(dsl_sync_task_t *, dmu_tx_t *);
+int dsl_sync_task(const char *, dsl_checkfunc_t *,
+    dsl_syncfunc_t *, void *, int, zfs_space_check_t);
+void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
+    void *, int, zfs_space_check_t, dmu_tx_t *);
 
 #ifdef	__cplusplus
 }

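The zfs_space_check_t levels translate into thresholds roughly as below. This is a sketch only: the 1/32 slop fraction approximates the 3.2 percent figure from the comment and stands in for spa_slop_shift; dsl_pool_adjustedsize() and the real space accounting are not reproduced.

#include <stdint.h>
#include <stdio.h>

typedef enum {
	SPACE_CHECK_NORMAL,	/* fail below ~3.2% free */
	SPACE_CHECK_RESERVED,	/* fail below ~1.6% free */
	SPACE_CHECK_NONE	/* never fail on space */
} space_check_t;

static int
space_check_passes(space_check_t sc, uint64_t total, uint64_t freesp)
{
	uint64_t slop = total / 32;	/* ~3.1%; cf. spa_slop_shift */

	switch (sc) {
	case SPACE_CHECK_NORMAL:
		return (freesp > slop);
	case SPACE_CHECK_RESERVED:
		return (freesp > slop / 2);	/* may use half the slop */
	default:
		return (1);
	}
}

int
main(void)
{
	uint64_t total = 1000000, freesp = 20000;	/* 2% free */

	printf("normal: %d reserved: %d none: %d\n",
	    space_check_passes(SPACE_CHECK_NORMAL, total, freesp),
	    space_check_passes(SPACE_CHECK_RESERVED, total, freesp),
	    space_check_passes(SPACE_CHECK_NONE, total, freesp));
	return (0);
}
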
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 
 /*
  * CDDL HEADER START

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_H
@@ -36,47 +37,68 @@
 extern "C" {
 #endif
 
-extern space_map_ops_t *zfs_metaslab_ops;
+typedef struct metaslab_ops {
+	uint64_t (*msop_alloc)(metaslab_t *msp, uint64_t size);
+} metaslab_ops_t;
 
-extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
-    uint64_t start, uint64_t size, uint64_t txg);
-extern void metaslab_fini(metaslab_t *msp);
-extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
-extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
-extern void metaslab_sync_reassess(metaslab_group_t *mg);
+extern metaslab_ops_t *zfs_metaslab_ops;
 
-#define	METASLAB_HINTBP_FAVOR	0x0
-#define	METASLAB_HINTBP_AVOID	0x1
-#define	METASLAB_GANG_HEADER	0x2
-#define	METASLAB_GANG_CHILD	0x4
-#define	METASLAB_GANG_AVOID	0x8
+int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
+    metaslab_t **);
+void metaslab_fini(metaslab_t *);
 
-extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
-    blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
-extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
-    boolean_t now);
-extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
-extern void metaslab_check_free(spa_t *spa, const blkptr_t *bp);
+void metaslab_load_wait(metaslab_t *);
+int metaslab_load(metaslab_t *);
+void metaslab_unload(metaslab_t *);
 
-extern metaslab_class_t *metaslab_class_create(spa_t *spa,
-    space_map_ops_t *ops);
-extern void metaslab_class_destroy(metaslab_class_t *mc);
-extern int metaslab_class_validate(metaslab_class_t *mc);
+void metaslab_sync(metaslab_t *, uint64_t);
+void metaslab_sync_done(metaslab_t *, uint64_t);
+void metaslab_sync_reassess(metaslab_group_t *);
+uint64_t metaslab_block_maxsize(metaslab_t *);
 
-extern void metaslab_class_space_update(metaslab_class_t *mc,
-    int64_t alloc_delta, int64_t defer_delta,
-    int64_t space_delta, int64_t dspace_delta);
-extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
-extern uint64_t metaslab_class_get_space(metaslab_class_t *mc);
-extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
-extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);
+#define	METASLAB_HINTBP_FAVOR		0x0
+#define	METASLAB_HINTBP_AVOID		0x1
+#define	METASLAB_GANG_HEADER		0x2
+#define	METASLAB_GANG_CHILD		0x4
+#define	METASLAB_ASYNC_ALLOC		0x8
+#define	METASLAB_DONT_THROTTLE		0x10
 
-extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
-    vdev_t *vd);
-extern void metaslab_group_destroy(metaslab_group_t *mg);
-extern void metaslab_group_activate(metaslab_group_t *mg);
-extern void metaslab_group_passivate(metaslab_group_t *mg);
+int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
+    blkptr_t *, int, uint64_t, blkptr_t *, int, zio_t *);
+void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
+int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
+void metaslab_check_free(spa_t *, const blkptr_t *);
 
+metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *);
+void metaslab_class_destroy(metaslab_class_t *);
+int metaslab_class_validate(metaslab_class_t *);
+void metaslab_class_histogram_verify(metaslab_class_t *);
+uint64_t metaslab_class_fragmentation(metaslab_class_t *);
+uint64_t metaslab_class_expandable_space(metaslab_class_t *);
+boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
+    zio_t *, int);
+void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
+
+void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
+    int64_t, int64_t);
+uint64_t metaslab_class_get_alloc(metaslab_class_t *);
+uint64_t metaslab_class_get_space(metaslab_class_t *);
+uint64_t metaslab_class_get_dspace(metaslab_class_t *);
+uint64_t metaslab_class_get_deferred(metaslab_class_t *);
+uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc);
+
+metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
+void metaslab_group_destroy(metaslab_group_t *);
+void metaslab_group_activate(metaslab_group_t *);
+void metaslab_group_passivate(metaslab_group_t *);
+boolean_t metaslab_group_initialized(metaslab_group_t *);
+uint64_t metaslab_group_get_space(metaslab_group_t *);
+void metaslab_group_histogram_verify(metaslab_group_t *);
+uint64_t metaslab_group_fragmentation(metaslab_group_t *);
+void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
+void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
+void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);
+
 #ifdef	__cplusplus
 }
 #endif

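The new metaslab_ops_t narrows the pluggable-allocator interface to a single msop_alloc hook, replacing the old space_map_ops_t vector. A toy version of that plug-in shape: a bitmap stands in for a metaslab and a first-fit routine stands in for the real weight-based allocators.

#include <stdint.h>
#include <stdio.h>

#define	NSEGS	16

typedef struct toy_metaslab {
	uint8_t used[NSEGS];		/* 1 = segment allocated */
} toy_metaslab_t;

typedef struct toy_ops {		/* cf. metaslab_ops_t */
	int64_t (*msop_alloc)(toy_metaslab_t *msp, uint64_t size);
} toy_ops_t;

/* A trivial first-fit allocator; size is fixed at one segment here. */
static int64_t
first_fit_alloc(toy_metaslab_t *msp, uint64_t size)
{
	(void) size;
	for (int i = 0; i < NSEGS; i++) {
		if (!msp->used[i]) {
			msp->used[i] = 1;
			return (i);
		}
	}
	return (-1);			/* no space in this metaslab */
}

static toy_ops_t first_fit_ops = { first_fit_alloc };

int
main(void)
{
	toy_metaslab_t ms = { { 0 } };
	toy_ops_t *ops = &first_fit_ops;	/* cf. zfs_metaslab_ops */

	printf("segment %lld\n", (long long)ops->msop_alloc(&ms, 1));
	printf("segment %lld\n", (long long)ops->msop_alloc(&ms, 1));
	return (0);
}
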
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -24,7 +25,7 @@
  */
 
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_IMPL_H
@@ -32,6 +33,7 @@
 
 #include <sys/metaslab.h>
 #include <sys/space_map.h>
+#include <sys/range_tree.h>
 #include <sys/vdev.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
@@ -40,66 +42,216 @@
 extern "C" {
 #endif
 
+/*
+ * A metaslab class encompasses a category of allocatable top-level vdevs.
+ * Each top-level vdev is associated with a metaslab group which defines
+ * the allocatable region for that vdev. Examples of these categories include
+ * "normal" for data block allocations (i.e. main pool allocations) or "log"
+ * for allocations designated for intent log devices (i.e. slog devices).
+ * When a block allocation is requested from the SPA it is associated with a
+ * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
+ * to the class can be used to satisfy that request. Allocations are done
+ * by traversing the metaslab groups that are linked off of the mc_rotor field.
+ * This rotor points to the next metaslab group where allocations will be
+ * attempted. Allocating a block is a 3 step process -- select the metaslab
+ * group, select the metaslab, and then allocate the block. The metaslab
+ * class defines the low-level block allocator that will be used as the
+ * final step in allocation. These allocators are pluggable, allowing each class
+ * to use a block allocator that best suits that class.
+ */
 struct metaslab_class {
+	kmutex_t		mc_lock;
 	spa_t			*mc_spa;
 	metaslab_group_t	*mc_rotor;
-	space_map_ops_t		*mc_ops;
+	metaslab_ops_t		*mc_ops;
 	uint64_t		mc_aliquot;
+
+	/*
+	 * Track the number of metaslab groups that have been initialized
+	 * and can accept allocations. An initialized metaslab group is
+	 * one that has been completely added to the config (i.e. we have
+	 * updated the MOS config and the space has been added to the pool).
+	 */
+	uint64_t		mc_groups;
+
+	/*
+	 * Toggle to enable/disable the allocation throttle.
+	 */
+	boolean_t		mc_alloc_throttle_enabled;
+
+	/*
+	 * The allocation throttle works on a reservation system. Whenever
+	 * an asynchronous zio wants to perform an allocation it must
+	 * first reserve the number of blocks that it wants to allocate.
+	 * If there aren't sufficient slots available for the pending zio
+	 * then that I/O is throttled until more slots free up. The current
+	 * number of reserved allocations is maintained by the mc_alloc_slots
+	 * refcount. The mc_alloc_max_slots value determines the maximum
+	 * number of allocations that the system allows. Gang blocks are
+	 * allowed to reserve slots even if we've reached the maximum
+	 * number of allocations allowed.
+	 */
+	uint64_t		mc_alloc_max_slots;
+	refcount_t		mc_alloc_slots;
+
+	uint64_t		mc_alloc_groups; /* # of allocatable groups */
+
 	uint64_t		mc_alloc;	/* total allocated space */
 	uint64_t		mc_deferred;	/* total deferred frees */
 	uint64_t		mc_space;	/* total space (alloc + free) */
 	uint64_t		mc_dspace;	/* total deflated space */
+	uint64_t		mc_minblocksize;
+	uint64_t		mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
 };
 
+/*
+ * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
+ * of a top-level vdev. They are linked together to form a circular linked
+ * list and can belong to only one metaslab class. Metaslab groups may become
+ * ineligible for allocations for a number of reasons such as limited free
+ * space, fragmentation, or going offline. When this happens the allocator will
+ * simply find the next metaslab group in the linked list and attempt
+ * to allocate from that group instead.
+ */
 struct metaslab_group {
 	kmutex_t		mg_lock;
 	avl_tree_t		mg_metaslab_tree;
 	uint64_t		mg_aliquot;
-	uint64_t		mg_bonus_area;
-	uint64_t		mg_alloc_failures;
+	boolean_t		mg_allocatable;		/* can we allocate? */
+
+	/*
+	 * A metaslab group is considered to be initialized only after
+	 * we have updated the MOS config and added the space to the pool.
+	 * We only allow allocation attempts to a metaslab group if it
+	 * has been initialized.
+	 */
+	boolean_t		mg_initialized;
+
+	uint64_t		mg_free_capacity;	/* percentage free */
 	int64_t			mg_bias;
 	int64_t			mg_activation_count;
 	metaslab_class_t	*mg_class;
 	vdev_t			*mg_vd;
+	taskq_t			*mg_taskq;
 	metaslab_group_t	*mg_prev;
 	metaslab_group_t	*mg_next;
+
+	/*
+	 * Each metaslab group can handle mg_max_alloc_queue_depth allocations
+	 * which are tracked by mg_alloc_queue_depth. It's possible for a
+	 * metaslab group to handle more allocations than its max. This
+	 * can occur when gang blocks are required or when other groups
+	 * are unable to handle their share of allocations.
+	 */
+	uint64_t		mg_max_alloc_queue_depth;
+	refcount_t		mg_alloc_queue_depth;
+
+	/*
+	 * A metaslab group that can no longer allocate the minimum block
+	 * size will set mg_no_free_space. Once a metaslab group is out
+	 * of space then its share of work must be distributed to other
+	 * groups.
+	 */
+	boolean_t		mg_no_free_space;
+
+	uint64_t		mg_allocations;
+	uint64_t		mg_failed_allocations;
+	uint64_t		mg_fragmentation;
+	uint64_t		mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
 };
 
 /*
- * Each metaslab maintains an in-core free map (ms_map) that contains the
- * current list of free segments. As blocks are allocated, the allocated
- * segment is removed from the ms_map and added to a per txg allocation map.
- * As blocks are freed, they are added to the per txg free map. These per
- * txg maps allow us to process all allocations and frees in syncing context
- * where it is safe to update the on-disk space maps.
+ * This value defines the number of elements in the ms_lbas array. The value
+ * of 64 was chosen as it covers all power-of-2 buckets up to UINT64_MAX.
+ * This is the equivalent of highbit(UINT64_MAX).
+ */
+#define	MAX_LBAS	64
+
+/*
+ * Each metaslab maintains a set of in-core trees to track metaslab operations.
+ * The in-core free tree (ms_tree) contains the current list of free segments.
+ * As blocks are allocated, the allocated segments are removed from the ms_tree
+ * and added to a per txg allocation tree (ms_alloctree). As blocks are freed,
+ * they are added to the per txg free tree (ms_freetree). These per txg
+ * trees allow us to process all allocations and frees in syncing context
+ * where it is safe to update the on-disk space maps. One additional in-core
+ * tree is maintained to track deferred frees (ms_defertree). Once a block
+ * is freed it will move from the ms_freetree to the ms_defertree. A deferred
+ * free means that a block has been freed but cannot be used by the pool
+ * until TXG_DEFER_SIZE transaction groups later. For example, a block
+ * that is freed in txg 50 will not be available for reallocation until
+ * txg 52 (50 + TXG_DEFER_SIZE).  This provides a safety net for uberblock
+ * rollback. A pool could be safely rolled back TXG_DEFER_SIZE
+ * transaction groups with the guarantee that no block has been reallocated.
  *
- * Each metaslab's free space is tracked in a space map object in the MOS,
+ * The simplified transition diagram looks like this:
+ *
+ *
+ *      ALLOCATE
+ *         |
+ *         V
+ *    free segment (ms_tree) --------> ms_alloctree ----> (write to space map)
+ *         ^
+ *         |
+ *         |                           ms_freetree <--- FREE
+ *         |                                 |
+ *         |                                 |
+ *         |                                 |
+ *         +----------- ms_defertree <-------+---------> (write to space map)
+ *
+ *
+ * Each metaslab's space is tracked in a single space map in the MOS,
  * which is only updated in syncing context. Each time we sync a txg,
- * we append the allocs and frees from that txg to the space map object.
- * When the txg is done syncing, metaslab_sync_done() updates ms_smo
- * to ms_smo_syncing. Everything in ms_smo is always safe to allocate.
+ * we append the allocs and frees from that txg to the space map.
+ * The pool space is only updated once all metaslabs have finished syncing.
  *
- * To load the in-core free map we read the space map object from disk.
+ * To load the in-core free tree we read the space map from disk.
  * This object contains a series of alloc and free records that are
  * combined to make up the list of all free segments in this metaslab. These
- * segments are represented in-core by the ms_map and are stored in an
+ * segments are represented in-core by the ms_tree and are stored in an
  * AVL tree.
  *
- * As the space map objects grows (as a result of the appends) it will
- * eventually become space-inefficient. When the space map object is
- * zfs_condense_pct/100 times the size of the minimal on-disk representation,
- * we rewrite it in its minimized form.
+ * As the space map grows (as a result of the appends) it will
+ * eventually become space-inefficient. When the metaslab's in-core free tree
+ * is zfs_condense_pct/100 times the size of the minimal on-disk
+ * representation, we rewrite it in its minimized form. If a metaslab
+ * needs to condense then we must set the ms_condensing flag to ensure
+ * that allocations are not performed on the metaslab that is being written.
  */
 struct metaslab {
-	kmutex_t	ms_lock;	/* metaslab lock		*/
-	space_map_obj_t	ms_smo;		/* synced space map object	*/
-	space_map_obj_t	ms_smo_syncing;	/* syncing space map object	*/
-	space_map_t	*ms_allocmap[TXG_SIZE];	/* allocated this txg	*/
-	space_map_t	*ms_freemap[TXG_SIZE];	/* freed this txg	*/
-	space_map_t	*ms_defermap[TXG_DEFER_SIZE];	/* deferred frees */
-	space_map_t	*ms_map;	/* in-core free space map	*/
+	kmutex_t	ms_lock;
+	kcondvar_t	ms_load_cv;
+	space_map_t	*ms_sm;
+	metaslab_ops_t	*ms_ops;
+	uint64_t	ms_id;
+	uint64_t	ms_start;
+	uint64_t	ms_size;
+	uint64_t	ms_fragmentation;
+
+	range_tree_t	*ms_alloctree[TXG_SIZE];
+	range_tree_t	*ms_freetree[TXG_SIZE];
+	range_tree_t	*ms_defertree[TXG_DEFER_SIZE];
+	range_tree_t	*ms_tree;
+
+	boolean_t	ms_condensing;	/* condensing? */
+	boolean_t	ms_condense_wanted;
+	boolean_t	ms_loaded;
+	boolean_t	ms_loading;
+
 	int64_t		ms_deferspace;	/* sum of ms_defermap[] space	*/
 	uint64_t	ms_weight;	/* weight vs. others in group	*/
+	uint64_t	ms_access_txg;
+
+	/*
+	 * The metaslab block allocators can optionally use a size-ordered
+	 * range tree and/or an array of LBAs. Not all allocators use
+	 * this functionality. The ms_size_tree should always contain the
+	 * same number of segments as the ms_tree. The only difference
+	 * is that the ms_size_tree is ordered by segment sizes.
+	 */
+	avl_tree_t	ms_size_tree;
+	uint64_t	ms_lbas[MAX_LBAS];
+
 	metaslab_group_t *ms_group;	/* metaslab group		*/
 	avl_node_t	ms_group_node;	/* node in metaslab group tree	*/
 	txg_node_t	ms_txg_node;	/* per-txg dirty metaslab links	*/

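The defer arithmetic in the comment above (freed in txg 50, allocatable again in txg 52) can be shown in isolation. The range-tree plumbing is elided; only the TXG_DEFER_SIZE offset is modeled.

#include <stdio.h>

#define	TXG_DEFER_SIZE	2

int
main(void)
{
	unsigned long long free_txg = 50;	/* block freed here */

	/*
	 * metaslab_sync_done() moves the txg-N freetree into a defertree
	 * slot; that slot folds back into ms_tree TXG_DEFER_SIZE txgs later.
	 */
	for (unsigned long long txg = free_txg; txg < free_txg + 4; txg++)
		printf("txg %llu: block %s\n", txg,
		    txg >= free_txg + TXG_DEFER_SIZE ?
		    "allocatable again" : "still deferred");
	return (0);
}
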
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_REFCOUNT_H
@@ -64,6 +65,7 @@
 
 void refcount_create(refcount_t *rc);
 void refcount_create_untracked(refcount_t *rc);
+void refcount_create_tracked(refcount_t *rc);
 void refcount_destroy(refcount_t *rc);
 void refcount_destroy_many(refcount_t *rc, uint64_t number);
 int refcount_is_zero(refcount_t *rc);
@@ -73,6 +75,9 @@
 int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
 int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
 void refcount_transfer(refcount_t *dst, refcount_t *src);
+void refcount_transfer_ownership(refcount_t *, void *, void *);
+boolean_t refcount_held(refcount_t *, void *);
+boolean_t refcount_not_held(refcount_t *, void *);
 
 void refcount_sysinit(void);
 void refcount_fini(void);
@@ -85,12 +90,13 @@
 
 #define	refcount_create(rc) ((rc)->rc_count = 0)
 #define	refcount_create_untracked(rc) ((rc)->rc_count = 0)
+#define	refcount_create_tracked(rc) ((rc)->rc_count = 0)
 #define	refcount_destroy(rc) ((rc)->rc_count = 0)
 #define	refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
 #define	refcount_is_zero(rc) ((rc)->rc_count == 0)
 #define	refcount_count(rc) ((rc)->rc_count)
-#define	refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1)
-#define	refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1)
+#define	refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count)
+#define	refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count)
 #define	refcount_add_many(rc, number, holder) \
 	atomic_add_64_nv(&(rc)->rc_count, number)
 #define	refcount_remove_many(rc, number, holder) \
@@ -100,6 +106,9 @@
 	atomic_add_64(&(src)->rc_count, -__tmp); \
 	atomic_add_64(&(dst)->rc_count, __tmp); \
 }
+#define	refcount_transfer_ownership(rc, current_holder, new_holder)
+#define	refcount_held(rc, holder)		((rc)->rc_count > 0)
+#define	refcount_not_held(rc, holder)		(B_TRUE)
 
 #define	refcount_sysinit()
 #define	refcount_fini()

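In the non-debug branch above the holder tag is simply discarded and refcount_add/refcount_remove collapse to plain atomic increments; the change from atomic_add_64_nv(..., 1) to atomic_inc_64_nv() keeps the same new-value semantics. A C11 rendering of that fallback (the toy_ names are mine):

#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
	_Atomic uint64_t rc_count;
} toy_refcount_t;

/* Like atomic_inc_64_nv()/atomic_dec_64_nv(): return the new value. */
#define	toy_refcount_add(rc, holder) \
	(atomic_fetch_add(&(rc)->rc_count, 1) + 1)
#define	toy_refcount_remove(rc, holder) \
	(atomic_fetch_sub(&(rc)->rc_count, 1) - 1)

int
main(void)
{
	toy_refcount_t rc = { 0 };

	(void) toy_refcount_add(&rc, NULL);	/* holder tag is ignored */
	(void) toy_refcount_add(&rc, NULL);
	printf("count after remove: %llu\n",
	    (unsigned long long)toy_refcount_remove(&rc, NULL));
	return (0);
}
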
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -69,6 +70,7 @@
 void rrw_destroy(rrwlock_t *rrl);
 void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag);
 void rrw_enter_read(rrwlock_t *rrl, void *tag);
+void rrw_enter_read_prio(rrwlock_t *rrl, void *tag);
 void rrw_enter_write(rrwlock_t *rrl);
 void rrw_exit(rrwlock_t *rrl, void *tag);
 boolean_t rrw_held(rrwlock_t *rrl, krw_t rw);
@@ -79,6 +81,31 @@
 #define	RRW_LOCK_HELD(x) \
 	(rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER))
 
+/*
+ * A reader-mostly lock implementation, tuned above plain reader-writer locks
+ * for highly parallel read acquisitions, pessimizing write acquisitions.
+ *
+ * This should be a prime number.  See comment in rrwlock.c near
+ * RRM_TD_LOCK() for details.
+ */
+#define	RRM_NUM_LOCKS		17
+typedef struct rrmlock {
+	rrwlock_t	locks[RRM_NUM_LOCKS];
+} rrmlock_t;
+
+void rrm_init(rrmlock_t *rrl, boolean_t track_all);
+void rrm_destroy(rrmlock_t *rrl);
+void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag);
+void rrm_enter_read(rrmlock_t *rrl, void *tag);
+void rrm_enter_write(rrmlock_t *rrl);
+void rrm_exit(rrmlock_t *rrl, void *tag);
+boolean_t rrm_held(rrmlock_t *rrl, krw_t rw);
+
+#define	RRM_READ_HELD(x)	rrm_held(x, RW_READER)
+#define	RRM_WRITE_HELD(x)	rrm_held(x, RW_WRITER)
+#define	RRM_LOCK_HELD(x) \
+	(rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER))
+
 #ifdef	__cplusplus
 }
 #endif

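The rrmlock is the classic reader-mostly construction: a reader takes only the sub-lock its thread hashes to, while a writer must sweep all RRM_NUM_LOCKS of them. A pthread sketch of that shape; the modulo hash is a guess at the spirit of RRM_TD_LOCK() (the real code hashes curthread), and the pthread_self() cast is not strictly portable.

#include <pthread.h>
#include <stddef.h>
#include <stdint.h>

#define	RRM_NUM_LOCKS	17	/* prime, as in the header */

typedef struct toy_rrmlock {
	pthread_rwlock_t locks[RRM_NUM_LOCKS];
} toy_rrmlock_t;

static int
rrm_td_lock(void)
{
	/* Spread threads across sub-locks; any cheap hash will do. */
	return ((int)((uintptr_t)pthread_self() % RRM_NUM_LOCKS));
}

static void
toy_rrm_enter_read(toy_rrmlock_t *rrl)
{
	/* Readers touch one sub-lock, so they rarely contend. */
	(void) pthread_rwlock_rdlock(&rrl->locks[rrm_td_lock()]);
}

static void
toy_rrm_enter_write(toy_rrmlock_t *rrl)
{
	/* Writers pay: acquire every sub-lock, in order. */
	for (int i = 0; i < RRM_NUM_LOCKS; i++)
		(void) pthread_rwlock_wrlock(&rrl->locks[i]);
}

int
main(void)
{
	toy_rrmlock_t rrl;

	for (int i = 0; i < RRM_NUM_LOCKS; i++)
		(void) pthread_rwlock_init(&rrl.locks[i], NULL);
	toy_rrm_enter_read(&rrl);
	(void) pthread_rwlock_unlock(&rrl.locks[rrm_td_lock()]);
	toy_rrm_enter_write(&rrl);
	for (int i = 0; i < RRM_NUM_LOCKS; i++)
		(void) pthread_rwlock_unlock(&rrl.locks[i]);
	return (0);
}
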
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -134,7 +135,6 @@
     uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *);
 void sa_object_info(sa_handle_t *, dmu_object_info_t *);
 void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *);
-void sa_update_user(sa_handle_t *, sa_handle_t *);
 void *sa_get_userdata(sa_handle_t *);
 void sa_set_userp(sa_handle_t *, void *);
 dmu_buf_t *sa_get_db(sa_handle_t *);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,8 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef	_SYS_SA_IMPL_H
@@ -153,12 +155,13 @@
  *
  * The header has a fixed portion with a variable number
  * of "lengths" depending on the number of variable sized
- * attribues which are determined by the "layout number"
+ * attributes which are determined by the "layout number"
  */
 
 #define	SA_MAGIC	0x2F505A  /* ZFS SA */
 typedef struct sa_hdr_phys {
 	uint32_t sa_magic;
+	/* BEGIN CSTYLED */
 	/*
 	 * Encoded with hdrsize and layout number as follows:
 	 * 16      10       0
@@ -175,6 +178,7 @@
 	 *          2 ==> 16 byte header
 	 *
 	 */
+	/* END CSTYLED */
 	uint16_t sa_layout_info;
 	uint16_t sa_lengths[1];	/* optional sizes for variable length attrs */
 	/* ... Data follows the lengths.  */
@@ -208,11 +212,12 @@
  */
 
 struct sa_handle {
+	dmu_buf_user_t	sa_dbu;
 	kmutex_t	sa_lock;
 	dmu_buf_t	*sa_bonus;
 	dmu_buf_t	*sa_spill;
 	objset_t	*sa_os;
-	void 		*sa_userp;
+	void		*sa_userp;
 	sa_idx_tab_t	*sa_bonus_tab;	 /* idx of bonus */
 	sa_idx_tab_t	*sa_spill_tab; /* only present if spill activated */
 };

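The sa_layout_info word sketched in the comment packs the layout number into the low 10 bits and the header size, in 8-byte units, above it, so a stored size of 2 means a 16-byte header. A standalone decode under those assumptions; the field widths here are read off the comment, not the SA_HDR_* macros.

#include <stdint.h>
#include <stdio.h>

static unsigned
sa_hdr_layout_num(uint16_t sa_layout_info)
{
	return (sa_layout_info & 0x3ff);	/* bits 0..9 */
}

static unsigned
sa_hdr_size(uint16_t sa_layout_info)
{
	/* Upper bits hold the size in 8-byte units; scale to bytes. */
	return (((sa_layout_info >> 10) & 0x3f) << 3);
}

int
main(void)
{
	uint16_t info = (2 << 10) | 3;	/* 16-byte header, layout 3 */

	printf("layout=%u hdrsize=%u bytes\n",
	    sa_hdr_layout_num(info), sa_hdr_size(info));
	return (0);
}
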
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,8 +21,11 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef _SYS_SPA_H
@@ -65,34 +69,72 @@
 #define	BF32_GET(x, low, len)		BF32_DECODE(x, low, len)
 #define	BF64_GET(x, low, len)		BF64_DECODE(x, low, len)
 
-#define	BF32_SET(x, low, len, val)	\
-	((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len))
-#define	BF64_SET(x, low, len, val)	\
-	((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len))
+#define	BF32_SET(x, low, len, val) do { \
+	ASSERT3U(val, <, 1U << (len)); \
+	ASSERT3U(low + len, <=, 32); \
+	(x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \
+_NOTE(CONSTCOND) } while (0)
 
+#define	BF64_SET(x, low, len, val) do { \
+	ASSERT3U(val, <, 1ULL << (len)); \
+	ASSERT3U(low + len, <=, 64); \
+	((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \
+_NOTE(CONSTCOND) } while (0)
+
 #define	BF32_GET_SB(x, low, len, shift, bias)	\
 	((BF32_GET(x, low, len) + (bias)) << (shift))
 #define	BF64_GET_SB(x, low, len, shift, bias)	\
 	((BF64_GET(x, low, len) + (bias)) << (shift))
 
-#define	BF32_SET_SB(x, low, len, shift, bias, val)	\
-	BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
-#define	BF64_SET_SB(x, low, len, shift, bias, val)	\
-	BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
+#define	BF32_SET_SB(x, low, len, shift, bias, val) do { \
+	ASSERT(IS_P2ALIGNED(val, 1U << shift)); \
+	ASSERT3S((val) >> (shift), >=, bias); \
+	BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \
+_NOTE(CONSTCOND) } while (0)
+#define	BF64_SET_SB(x, low, len, shift, bias, val) do { \
+	ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \
+	ASSERT3S((val) >> (shift), >=, bias); \
+	BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \
+_NOTE(CONSTCOND) } while (0)
 
 /*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
+ * We currently support block sizes from 512 bytes to 16MB.
+ * The benefits of larger blocks, and thus larger IO, need to be weighed
+ * against the cost of COWing a giant block to modify one byte, and the
+ * large latency of reading or writing a large block.
+ *
+ * Note that although blocks up to 16MB are supported, the recordsize
+ * property can not be set larger than zfs_max_recordsize (default 1MB).
+ * See the comment near zfs_max_recordsize in dsl_dataset.c for details.
+ *
+ * Note that although the LSIZE field of the blkptr_t can store sizes up
+ * to 32MB, the dnode's dn_datablkszsec can only store sizes up to
+ * 32MB - 512 bytes.  Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
  */
 #define	SPA_MINBLOCKSHIFT	9
-#define	SPA_MAXBLOCKSHIFT	17
+#define	SPA_OLD_MAXBLOCKSHIFT	17
+#define	SPA_MAXBLOCKSHIFT	24
 #define	SPA_MINBLOCKSIZE	(1ULL << SPA_MINBLOCKSHIFT)
+#define	SPA_OLD_MAXBLOCKSIZE	(1ULL << SPA_OLD_MAXBLOCKSHIFT)
 #define	SPA_MAXBLOCKSIZE	(1ULL << SPA_MAXBLOCKSHIFT)
 
-#define	SPA_BLOCKSIZES		(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
+/*
+ * Default maximum supported logical ashift.
+ *
+ * The current 8k allocation block size limit is due to the 8k
+ * aligned/sized operations performed by vdev_probe() on
+ * vdev_label->vl_pad2.  Using another "safe region" for these tests
+ * would allow the limit to be raised to 16k, at the expense of
+ * only having 8 available uberblocks in the label area.
+ */
+#define	SPA_MAXASHIFT		13
 
 /*
+ * Default minimum supported logical ashift.
+ */
+#define	SPA_MINASHIFT		SPA_MINBLOCKSHIFT
+
+/*
  * Size of block to hold the configuration data (a packed nvlist)
  */
 #define	SPA_CONFIG_BLOCKSIZE	(1ULL << 14)
@@ -108,6 +150,8 @@
 #define	SPA_PSIZEBITS		16	/* PSIZE up to 32M (2^16 * 512)	*/
 #define	SPA_ASIZEBITS		24	/* ASIZE up to 64 times larger	*/
 
+#define	SPA_COMPRESSBITS	7
+
 /*
  * All SPA data is represented by 128-bit data virtual addresses (DVAs).
  * The members of the dva_t should be considered opaque outside the SPA.
@@ -124,6 +168,14 @@
 } zio_cksum_t;
 
 /*
+ * Some checksums/hashes need a 256-bit initialization salt. This salt is kept
+ * secret and is suitable for use in MAC algorithms as the key.
+ */
+typedef struct zio_cksum_salt {
+	uint8_t		zcs_bytes[32];
+} zio_cksum_salt_t;
+
+/*
  * Each block is described by its DVAs, time of birth, checksum, etc.
  * The word-by-word, bit-by-bit layout of the blkptr is as follows:
  *
@@ -141,7 +193,7 @@
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 5	|G|			 offset3				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 6	|BDX|lvl| type	| cksum | comp	|     PSIZE	|     LSIZE	|
+ * 6	|BDX|lvl| type	| cksum |E| comp|    PSIZE	|     LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
@@ -175,7 +227,8 @@
  * G		gang block indicator
  * B		byteorder (endianness)
  * D		dedup
- * X		unused
+ * X		encryption (on version 30, which is not supported)
+ * E		blkptr_t contains embedded data (see below)
  * lvl		level of indirection
  * type		DMU object type
  * phys birth	txg of block allocation; zero if same as logical birth txg
@@ -183,9 +236,112 @@
  * fill count	number of non-zero blocks under this bp
  * checksum[4]	256-bit checksum of the data this bp describes
  */
+
+/*
+ * "Embedded" blkptr_t's don't actually point to a block, instead they
+ * have a data payload embedded in the blkptr_t itself.  See the comment
+ * in blkptr.c for more details.
+ *
+ * The blkptr_t is laid out as follows:
+ *
+ *	64	56	48	40	32	24	16	8	0
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0	|      payload                                                  |
+ * 1	|      payload                                                  |
+ * 2	|      payload                                                  |
+ * 3	|      payload                                                  |
+ * 4	|      payload                                                  |
+ * 5	|      payload                                                  |
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6	|BDX|lvl| type	| etype |E| comp| PSIZE|              LSIZE	|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7	|      payload                                                  |
+ * 8	|      payload                                                  |
+ * 9	|      payload                                                  |
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * a	|			logical birth txg			|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * b	|      payload                                                  |
+ * c	|      payload                                                  |
+ * d	|      payload                                                  |
+ * e	|      payload                                                  |
+ * f	|      payload                                                  |
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * payload		contains the embedded data
+ * B (byteorder)	byteorder (endianness)
+ * D (dedup)		padding (set to zero)
+ * X			encryption (set to zero; see above)
+ * E (embedded)		set to one
+ * lvl			indirection level
+ * type			DMU object type
+ * etype		how to interpret embedded data (BP_EMBEDDED_TYPE_*)
+ * comp			compression function of payload
+ * PSIZE		size of payload after compression, in bytes
+ * LSIZE		logical size of payload, in bytes
+ *			note that 25 bits is enough to store the largest
+ *			"normal" BP's LSIZE (2^16 * 2^9) in bytes
+ * log. birth		transaction group in which the block was logically born
+ *
+ * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
+ * bp's they are stored in units of SPA_MINBLOCKSHIFT.
+ * Generally, the generic BP_GET_*() macros can be used on embedded BP's.
+ * The B, D, X, lvl, type, and comp fields are stored the same as with normal
+ * BP's so the BP_SET_* macros can be used with them.  etype, PSIZE, LSIZE must
+ * be set with the BPE_SET_* macros.  BP_SET_EMBEDDED() should be called before
+ * other macros, as they assert that they are only used on BP's of the correct
+ * "embedded-ness".
+ */
+
+#define	BPE_GET_ETYPE(bp)	\
+	(ASSERT(BP_IS_EMBEDDED(bp)), \
+	BF64_GET((bp)->blk_prop, 40, 8))
+#define	BPE_SET_ETYPE(bp, t)	do { \
+	ASSERT(BP_IS_EMBEDDED(bp)); \
+	BF64_SET((bp)->blk_prop, 40, 8, t); \
+_NOTE(CONSTCOND) } while (0)
+
+#define	BPE_GET_LSIZE(bp)	\
+	(ASSERT(BP_IS_EMBEDDED(bp)), \
+	BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
+#define	BPE_SET_LSIZE(bp, x)	do { \
+	ASSERT(BP_IS_EMBEDDED(bp)); \
+	BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+#define	BPE_GET_PSIZE(bp)	\
+	(ASSERT(BP_IS_EMBEDDED(bp)), \
+	BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
+#define	BPE_SET_PSIZE(bp, x)	do { \
+	ASSERT(BP_IS_EMBEDDED(bp)); \
+	BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+typedef enum bp_embedded_type {
+	BP_EMBEDDED_TYPE_DATA,
+	BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */
+	NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
+} bp_embedded_type_t;
+
+#define	BPE_NUM_WORDS 14
+#define	BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
+#define	BPE_IS_PAYLOADWORD(bp, wp) \
+	((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
+
 #define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
 #define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
 
+/*
+ * A block is a hole when it either 1) has never been written to, or
+ * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads
+ * without physically allocating disk space. Holes are represented in the
+ * blkptr_t structure by zeroed blk_dva. Correct checking for holes is
+ * done through the BP_IS_HOLE macro. For holes, the logical size, level,
+ * DMU object type, and birth times are all also stored for holes that
+ * were written to at some point (i.e. were punched after having been filled).
+ */
 typedef struct blkptr {
 	dva_t		blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
 	uint64_t	blk_prop;	/* size, compression, type, etc	    */
@@ -200,9 +356,10 @@
  * Macros to get and set fields in a bp or DVA.
  */
 #define	DVA_GET_ASIZE(dva)	\
-	BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0)
+	BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
 #define	DVA_SET_ASIZE(dva, x)	\
-	BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x)
+	BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
+	SPA_MINBLOCKSHIFT, 0, x)
 
 #define	DVA_GET_GRID(dva)	BF64_GET((dva)->dva_word[0], 24, 8)
 #define	DVA_SET_GRID(dva, x)	BF64_SET((dva)->dva_word[0], 24, 8, x)
@@ -219,21 +376,40 @@
 #define	DVA_SET_GANG(dva, x)	BF64_SET((dva)->dva_word[1], 63, 1, x)
 
 #define	BP_GET_LSIZE(bp)	\
-	BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
-#define	BP_SET_LSIZE(bp, x)	\
-	BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+	(BP_IS_EMBEDDED(bp) ?	\
+	(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
+	BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
+#define	BP_SET_LSIZE(bp, x)	do { \
+	ASSERT(!BP_IS_EMBEDDED(bp)); \
+	BF64_SET_SB((bp)->blk_prop, \
+	    0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
+_NOTE(CONSTCOND) } while (0)
 
 #define	BP_GET_PSIZE(bp)	\
-	BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
-#define	BP_SET_PSIZE(bp, x)	\
-	BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+	(BP_IS_EMBEDDED(bp) ? 0 : \
+	BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
+#define	BP_SET_PSIZE(bp, x)	do { \
+	ASSERT(!BP_IS_EMBEDDED(bp)); \
+	BF64_SET_SB((bp)->blk_prop, \
+	    16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
+_NOTE(CONSTCOND) } while (0)
 
-#define	BP_GET_COMPRESS(bp)		BF64_GET((bp)->blk_prop, 32, 8)
-#define	BP_SET_COMPRESS(bp, x)		BF64_SET((bp)->blk_prop, 32, 8, x)
+#define	BP_GET_COMPRESS(bp)		\
+	BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
+#define	BP_SET_COMPRESS(bp, x)		\
+	BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)
 
-#define	BP_GET_CHECKSUM(bp)		BF64_GET((bp)->blk_prop, 40, 8)
-#define	BP_SET_CHECKSUM(bp, x)		BF64_SET((bp)->blk_prop, 40, 8, x)
+#define	BP_IS_EMBEDDED(bp)		BF64_GET((bp)->blk_prop, 39, 1)
+#define	BP_SET_EMBEDDED(bp, x)		BF64_SET((bp)->blk_prop, 39, 1, x)
 
+#define	BP_GET_CHECKSUM(bp)		\
+	(BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
+	BF64_GET((bp)->blk_prop, 40, 8))
+#define	BP_SET_CHECKSUM(bp, x)		do { \
+	ASSERT(!BP_IS_EMBEDDED(bp)); \
+	BF64_SET((bp)->blk_prop, 40, 8, x); \
+_NOTE(CONSTCOND) } while (0)
+
 #define	BP_GET_TYPE(bp)			BF64_GET((bp)->blk_prop, 48, 8)
 #define	BP_SET_TYPE(bp, x)		BF64_SET((bp)->blk_prop, 48, 8, x)
 
@@ -240,27 +416,30 @@
 #define	BP_GET_LEVEL(bp)		BF64_GET((bp)->blk_prop, 56, 5)
 #define	BP_SET_LEVEL(bp, x)		BF64_SET((bp)->blk_prop, 56, 5, x)
 
-#define	BP_GET_PROP_BIT_61(bp)		BF64_GET((bp)->blk_prop, 61, 1)
-#define	BP_SET_PROP_BIT_61(bp, x)	BF64_SET((bp)->blk_prop, 61, 1, x)
-
 #define	BP_GET_DEDUP(bp)		BF64_GET((bp)->blk_prop, 62, 1)
 #define	BP_SET_DEDUP(bp, x)		BF64_SET((bp)->blk_prop, 62, 1, x)
 
-#define	BP_GET_BYTEORDER(bp)		(0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define	BP_GET_BYTEORDER(bp)		BF64_GET((bp)->blk_prop, 63, 1)
 #define	BP_SET_BYTEORDER(bp, x)		BF64_SET((bp)->blk_prop, 63, 1, x)
 
 #define	BP_PHYSICAL_BIRTH(bp)		\
-	((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+	(BP_IS_EMBEDDED(bp) ? 0 : \
+	(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
 
 #define	BP_SET_BIRTH(bp, logical, physical)	\
 {						\
+	ASSERT(!BP_IS_EMBEDDED(bp));		\
 	(bp)->blk_birth = (logical);		\
 	(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
 }
 
+#define	BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
+
 #define	BP_GET_ASIZE(bp)	\
-	(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
-		DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+	(BP_IS_EMBEDDED(bp) ? 0 : \
+	DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+	DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+	DVA_GET_ASIZE(&(bp)->blk_dva[2]))
 
 #define	BP_GET_UCSIZE(bp) \
 	((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \
@@ -267,14 +446,16 @@
 	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
 
 #define	BP_GET_NDVAS(bp)	\
-	(!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+	(BP_IS_EMBEDDED(bp) ? 0 : \
+	!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
 	!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 	!!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
 
 #define	BP_COUNT_GANG(bp)	\
+	(BP_IS_EMBEDDED(bp) ? 0 : \
 	(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
 	DVA_GET_GANG(&(bp)->blk_dva[1]) + \
-	DVA_GET_GANG(&(bp)->blk_dva[2]))
+	DVA_GET_GANG(&(bp)->blk_dva[2])))
 
 #define	DVA_EQUAL(dva1, dva2)	\
 	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
@@ -282,6 +463,7 @@
 
 #define	BP_EQUAL(bp1, bp2)	\
 	(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) &&	\
+	(bp1)->blk_birth == (bp2)->blk_birth &&			\
 	DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
@@ -292,6 +474,19 @@
 	((zc1).zc_word[2] - (zc2).zc_word[2]) | \
 	((zc1).zc_word[3] - (zc2).zc_word[3])))
 
+#define	ZIO_CHECKSUM_IS_ZERO(zc) \
+	(0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \
+	(zc)->zc_word[2] | (zc)->zc_word[3]))
+
+#define	ZIO_CHECKSUM_BSWAP(zcp)					\
+{								\
+	(zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]);	\
+	(zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]);	\
+	(zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]);	\
+	(zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]);	\
+}
+
+
 #define	DVA_IS_VALID(dva)	(DVA_GET_ASIZE(dva) != 0)
 
 #define	ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3)	\
@@ -302,9 +497,13 @@
 	(zcp)->zc_word[3] = w3;			\
 }
 
-#define	BP_IDENTITY(bp)		(&(bp)->blk_dva[0])
-#define	BP_IS_GANG(bp)		DVA_GET_GANG(BP_IDENTITY(bp))
-#define	BP_IS_HOLE(bp)		((bp)->blk_birth == 0)
+#define	BP_IDENTITY(bp)		(ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
+#define	BP_IS_GANG(bp)		\
+	(BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
+#define	DVA_IS_EMPTY(dva)	((dva)->dva_word[0] == 0ULL &&	\
+				(dva)->dva_word[1] == 0ULL)
+#define	BP_IS_HOLE(bp) \
+	(!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))
 
 /* BP_IS_RAIDZ(bp) assumes no block compression */
 #define	BP_IS_RAIDZ(bp)		(DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
@@ -327,14 +526,10 @@
 	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0);	\
 }
 
-/*
- * Note: the byteorder is either 0 or -1, both of which are palindromes.
- * This simplifies the endianness handling a bit.
- */
 #if BYTE_ORDER == _BIG_ENDIAN
 #define	ZFS_HOST_BYTEORDER	(0ULL)
 #else
-#define	ZFS_HOST_BYTEORDER	(-1ULL)
+#define	ZFS_HOST_BYTEORDER	(1ULL)
 #endif
 
 #define	BP_SHOULD_BYTESWAP(bp)	(BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
@@ -346,18 +541,34 @@
  * 'func' is either snprintf() or mdb_snprintf().
  * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
  */
-#define	SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress)	\
+#define	SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
 {									\
 	static const char *copyname[] =					\
 	    { "zero", "single", "double", "triple" };			\
-	int size = BP_SPRINTF_LEN;					\
 	int len = 0;							\
 	int copies = 0;							\
 									\
 	if (bp == NULL) {						\
-		len = func(buf + len, size - len, "<NULL>");		\
+		len += func(buf + len, size - len, "<NULL>");		\
 	} else if (BP_IS_HOLE(bp)) {					\
-		len = func(buf + len, size - len, "<hole>");		\
+		len += func(buf + len, size - len,			\
+		    "HOLE [L%llu %s] "					\
+		    "size=%llxL birth=%lluL",				\
+		    (u_longlong_t)BP_GET_LEVEL(bp),			\
+		    type,						\
+		    (u_longlong_t)BP_GET_LSIZE(bp),			\
+		    (u_longlong_t)bp->blk_birth);			\
+	} else if (BP_IS_EMBEDDED(bp)) {				\
+		len = func(buf + len, size - len,			\
+		    "EMBEDDED [L%llu %s] et=%u %s "			\
+		    "size=%llxL/%llxP birth=%lluL",			\
+		    (u_longlong_t)BP_GET_LEVEL(bp),			\
+		    type,						\
+		    (int)BPE_GET_ETYPE(bp),				\
+		    compress,						\
+		    (u_longlong_t)BPE_GET_LSIZE(bp),			\
+		    (u_longlong_t)BPE_GET_PSIZE(bp),			\
+		    (u_longlong_t)bp->blk_birth);			\
 	} else {							\
 		for (int d = 0; d < BP_GET_NDVAS(bp); d++) {		\
 			const dva_t *dva = &bp->blk_dva[d];		\
@@ -391,7 +602,7 @@
 		    (u_longlong_t)BP_GET_PSIZE(bp),			\
 		    (u_longlong_t)bp->blk_birth,			\
 		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp),		\
-		    (u_longlong_t)bp->blk_fill,				\
+		    (u_longlong_t)BP_GET_FILL(bp),			\
 		    ws,							\
 		    (u_longlong_t)bp->blk_cksum.zc_word[0],		\
 		    (u_longlong_t)bp->blk_cksum.zc_word[1],		\
@@ -420,7 +631,7 @@
     size_t buflen);
 extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
     nvlist_t *zplprops);
-#if defined(sun)
+#ifdef illumos
 extern int spa_import_rootpool(char *devpath, char *devid);
 #else
 extern int spa_import_rootpool(const char *name);
@@ -521,6 +732,7 @@
 /* Refcount functions */
 extern void spa_open_ref(spa_t *spa, void *tag);
 extern void spa_close(spa_t *spa, void *tag);
+extern void spa_async_close(spa_t *spa, void *tag);
 extern boolean_t spa_refcount_zero(spa_t *spa);
 
 #define	SCL_NONE	0x00
@@ -587,11 +799,15 @@
 extern uint64_t spa_freeze_txg(spa_t *spa);
 extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
 extern uint64_t spa_get_dspace(spa_t *spa);
+extern uint64_t spa_get_slop_space(spa_t *spa);
 extern void spa_update_dspace(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
 extern boolean_t spa_deflate(spa_t *spa);
 extern metaslab_class_t *spa_normal_class(spa_t *spa);
 extern metaslab_class_t *spa_log_class(spa_t *spa);
+extern void spa_evicting_os_register(spa_t *, objset_t *os);
+extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
+extern void spa_evicting_os_wait(spa_t *spa);
 extern int spa_max_replication(spa_t *spa);
 extern int spa_prev_software_version(spa_t *spa);
 extern int spa_busy(void);
@@ -603,7 +819,8 @@
 extern uint64_t spa_deadman_synctime(spa_t *spa);
 
 /* Miscellaneous support routines */
-extern void spa_activate_mos_feature(spa_t *spa, const char *feature);
+extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
+    dmu_tx_t *tx);
 extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
 extern int spa_rename(const char *oldname, const char *newname);
 extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
@@ -612,7 +829,7 @@
 extern void spa_strfree(char *);
 extern uint64_t spa_get_random(uint64_t range);
 extern uint64_t spa_generate_guid(spa_t *spa);
-extern void sprintf_blkptr(char *buf, const blkptr_t *bp);
+extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
 extern void spa_freeze(spa_t *spa);
 extern int spa_change_guid(spa_t *spa);
 extern void spa_upgrade(spa_t *spa, uint64_t version);
@@ -626,6 +843,9 @@
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
+extern boolean_t spa_has_pending_synctask(spa_t *spa);
+extern int spa_maxblocksize(spa_t *spa);
+extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
 
 extern int spa_mode(spa_t *spa);
 extern uint64_t zfs_strtonum(const char *str, char **nptr);
@@ -647,7 +867,7 @@
     dmu_tx_t *tx, const char *fmt, ...);
 
 /* error handling */
-struct zbookmark;
+struct zbookmark_phys;
 extern void spa_log_error(spa_t *spa, zio_t *zio);
 extern void zfs_ereport_post(const char *cls, spa_t *spa, vdev_t *vd,
     zio_t *zio, uint64_t stateoroffset, uint64_t length);
@@ -681,9 +901,9 @@
 
 #ifdef ZFS_DEBUG
 #define	dprintf_bp(bp, fmt, ...) do {				\
-	if (zfs_flags & ZFS_DEBUG_DPRINTF) { 			\
+	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
-	sprintf_blkptr(__blkbuf, (bp));				\
+	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp));	\
 	dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf);		\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
 	} \

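The hardened BF64_SET_SB macros above keep the same shift/bias scheme the blkptr size fields have always used: sizes are stored in 512-byte units, biased by one, which is why a 128K block encodes as 255. A standalone round-trip of the LSIZE encoding (simplified: the real macros also mask the value into blk_prop and now assert alignment and range):

#include <stdint.h>
#include <stdio.h>

#define	SPA_MINBLOCKSHIFT	9

static uint64_t
bf64_encode_sb(uint64_t bytes, int shift, uint64_t bias)
{
	return ((bytes >> shift) - bias);
}

static uint64_t
bf64_decode_sb(uint64_t field, int shift, uint64_t bias)
{
	return ((field + bias) << shift);
}

int
main(void)
{
	uint64_t lsize = 128 * 1024;	/* 128K logical block */
	uint64_t enc = bf64_encode_sb(lsize, SPA_MINBLOCKSHIFT, 1);

	printf("encoded: %llu\n", (unsigned long long)enc);	/* 255 */
	printf("decoded: %llu\n",
	    (unsigned long long)bf64_decode_sb(enc, SPA_MINBLOCKSHIFT, 1));
	return (0);
}
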
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,9 +21,11 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
  */
 
 #ifndef _SYS_SPA_IMPL_H
@@ -39,6 +42,8 @@
 #include <sys/refcount.h>
 #include <sys/bplist.h>
 #include <sys/bpobj.h>
+#include <sys/zfeature.h>
+#include <zfeature_common.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -45,9 +50,9 @@
 #endif
 
 typedef struct spa_error_entry {
-	zbookmark_t	se_bookmark;
-	char		*se_name;
-	avl_node_t	se_avl;
+	zbookmark_phys_t	se_bookmark;
+	char			*se_name;
+	avl_node_t		se_avl;
 } spa_error_entry_t;
 
 typedef struct spa_history_phys {
@@ -81,16 +86,16 @@
 	char		*scd_path;
 } spa_config_dirent_t;
 
-enum zio_taskq_type {
+typedef enum zio_taskq_type {
 	ZIO_TASKQ_ISSUE = 0,
 	ZIO_TASKQ_ISSUE_HIGH,
 	ZIO_TASKQ_INTERRUPT,
 	ZIO_TASKQ_INTERRUPT_HIGH,
 	ZIO_TASKQ_TYPES
-};
+} zio_taskq_type_t;
 
 /*
- * State machine for the zpool-pooname process.  The states transitions
+ * State machine for the zpool-poolname process.  The state transitions
  * are done as follows:
  *
  *	From		   To			Routine
@@ -108,11 +113,16 @@
 	SPA_PROC_GONE		/* spa_thread() is exiting, spa_proc = &p0 */
 } spa_proc_state_t;
 
+typedef struct spa_taskqs {
+	uint_t stqs_count;
+	taskq_t **stqs_taskq;
+} spa_taskqs_t;
+
 struct spa {
 	/*
 	 * Fields protected by spa_namespace_lock.
 	 */
-	char		spa_name[MAXNAMELEN];	/* pool name */
+	char		spa_name[ZFS_MAX_DATASET_NAME_LEN];	/* pool name */
 	char		*spa_comment;		/* comment */
 	avl_node_t	spa_avl;		/* node in spa_namespace_avl */
 	nvlist_t	*spa_config;		/* last synced config */
@@ -126,7 +136,7 @@
 	uint8_t		spa_sync_on;		/* sync threads are running */
 	spa_load_state_t spa_load_state;	/* current load operation */
 	uint64_t	spa_import_flags;	/* import specific flags */
-	taskq_t		*spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
+	spa_taskqs_t	spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
 	dsl_pool_t	*spa_dsl_pool;
 	boolean_t	spa_is_initializing;	/* true while opening pool */
 	metaslab_class_t *spa_normal_class;	/* normal data class */
@@ -138,13 +148,20 @@
 	uint64_t	spa_claim_max_txg;	/* highest claimed birth txg */
 	timespec_t	spa_loaded_ts;		/* 1st successful open time */
 	objset_t	*spa_meta_objset;	/* copy of dp->dp_meta_objset */
+	kmutex_t	spa_evicting_os_lock;	/* Evicting objset list lock */
+	list_t		spa_evicting_os_list;	/* Objsets being evicted. */
+	kcondvar_t	spa_evicting_os_cv;	/* Objset Eviction Completion */
 	txg_list_t	spa_vdev_txg_list;	/* per-txg dirty vdev list */
 	vdev_t		*spa_root_vdev;		/* top-level vdev container */
+	int		spa_min_ashift;		/* of vdevs in normal class */
+	int		spa_max_ashift;		/* of vdevs in normal class */
 	uint64_t	spa_config_guid;	/* config pool guid */
 	uint64_t	spa_load_guid;		/* spa_load initialized guid */
 	uint64_t	spa_last_synced_guid;	/* last synced guid */
 	list_t		spa_config_dirty_list;	/* vdevs with dirty config */
 	list_t		spa_state_dirty_list;	/* vdevs with dirty state */
+	kmutex_t	spa_alloc_lock;
+	avl_tree_t	spa_alloc_tree;
 	spa_aux_vdev_t	spa_spares;		/* hot spares */
 	spa_aux_vdev_t	spa_l2cache;		/* L2ARC cache devices */
 	nvlist_t	*spa_label_features;	/* Features for reading MOS */
@@ -153,6 +170,10 @@
 	uint64_t	spa_syncing_txg;	/* txg currently syncing */
 	bpobj_t		spa_deferred_bpobj;	/* deferred-free bplist */
 	bplist_t	spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
+	zio_cksum_salt_t spa_cksum_salt;	/* secret salt for cksum */
+	/* checksum context templates */
+	kmutex_t	spa_cksum_tmpls_lock;
+	void		*spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
 	uberblock_t	spa_ubsync;		/* last synced uberblock */
 	uberblock_t	spa_uberblock;		/* current uberblock */
 	boolean_t	spa_extreme_rewind;	/* rewind past deferred frees */
@@ -169,6 +190,7 @@
 	uint64_t	spa_scan_pass_exam;	/* examined bytes per pass */
 	kmutex_t	spa_async_lock;		/* protect async state */
 	kthread_t	*spa_async_thread;	/* thread doing async task */
+	kthread_t	*spa_async_thread_vd;	/* thread doing vd async task */
 	int		spa_async_suspended;	/* async tasks suspended */
 	kcondvar_t	spa_async_cv;		/* wait for thread_exit() */
 	uint16_t	spa_async_tasks;	/* async task mask */
@@ -198,7 +220,8 @@
 	uint64_t	spa_failmode;		/* failure mode for the pool */
 	uint64_t	spa_delegation;		/* delegation on/off */
 	list_t		spa_config_list;	/* previous cache file(s) */
-	zio_t		*spa_async_zio_root;	/* root of all async I/O */
+	/* per-CPU array of async I/O roots: */
+	zio_t		**spa_async_zio_root;
 	zio_t		*spa_suspend_zio_root;	/* root of all suspended I/O */
 	kmutex_t	spa_suspend_lock;	/* protects suspend_zio_root */
 	kcondvar_t	spa_suspend_cv;		/* notification of resume */
@@ -231,17 +254,34 @@
 	uint64_t	spa_feat_for_write_obj;	/* required to write to pool */
 	uint64_t	spa_feat_for_read_obj;	/* required to read from pool */
 	uint64_t	spa_feat_desc_obj;	/* Feature descriptions */
+	uint64_t	spa_feat_enabled_txg_obj; /* Feature enabled txg */
+	/* cache feature refcounts */
+	uint64_t	spa_feat_refcount_cache[SPA_FEATURES];
 #ifdef illumos
 	cyclic_id_t	spa_deadman_cycid;	/* cyclic id */
-#else	/* FreeBSD */
+#else	/* !illumos */
 #ifdef _KERNEL
 	struct callout	spa_deadman_cycid;	/* callout id */
+	struct task	spa_deadman_task;
 #endif
 #endif	/* illumos */
 	uint64_t	spa_deadman_calls;	/* number of deadman calls */
-	uint64_t	spa_sync_starttime;	/* starting time fo spa_sync */
+	hrtime_t	spa_sync_starttime;	/* starting time of spa_sync */
 	uint64_t	spa_deadman_synctime;	/* deadman expiration timer */
+#ifdef illumos
+	/*
+	 * spa_iokstat_lock protects spa_iokstat and
+	 * spa_queue_stats[].
+	 */
+	kmutex_t	spa_iokstat_lock;
+	struct kstat	*spa_iokstat;		/* kstat of io to this pool */
+	struct {
+		int spa_active;
+		int spa_queued;
+	} spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE];
+#endif
 	hrtime_t	spa_ccw_fail_time;	/* Conf cache write fail time */
+
 	/*
 	 * spa_refcount & spa_config_lock must be the last elements
 	 * because refcount_t changes size based on compilation options.
@@ -250,7 +290,7 @@
 	 */
 	spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
 	refcount_t	spa_refcount;		/* number of opens */
-#ifndef sun
+#ifndef illumos
 	boolean_t	spa_splitting_newspa;	/* creating new spa in split */
 #endif
 };
@@ -257,6 +297,9 @@
 
 extern const char *spa_config_path;
 
+extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
+
 #ifdef	__cplusplus
 }
 #endif
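
With spa_zio_taskq[][] now holding a spa_taskqs_t (a count plus an array
of taskqs) instead of a single taskq_t pointer, dispatch has to select one
of the backing queues. A hedged sketch of what a consumer of the new
spa_taskq_dispatch_ent() interface does internally; the selection policy
shown is illustrative, the authoritative version lives in spa.c:

	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	if (tqs->stqs_count == 1)
		tq = tqs->stqs_taskq[0];
	else
		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];

	taskq_dispatch_ent(tq, func, arg, flags, ent);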

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -24,7 +25,7 @@
  */
 
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_SPACE_MAP_H
@@ -31,6 +32,7 @@
 #define	_SYS_SPACE_MAP_H
 
 #include <sys/avl.h>
+#include <sys/range_tree.h>
 #include <sys/dmu.h>
 
 #ifdef	__cplusplus
@@ -37,53 +39,56 @@
 extern "C" {
 #endif
 
-typedef struct space_map_ops space_map_ops_t;
+/*
+ * The size of the space map object has increased to include a histogram.
+ * The SPACE_MAP_SIZE_V0 designates the original size and is used to
+ * maintain backward compatibility.
+ */
+#define	SPACE_MAP_SIZE_V0	(3 * sizeof (uint64_t))
+#define	SPACE_MAP_HISTOGRAM_SIZE	32
 
+/*
+ * The space_map_phys is the on-disk representation of the space map.
+ * Consumers of space maps should never reference any of the members of this
+ * structure directly. These members may only be updated in syncing context.
+ *
+ * Note the smp_object is no longer used but remains in the structure
+ * for backward compatibility.
+ */
+typedef struct space_map_phys {
+	uint64_t	smp_object;	/* on-disk space map object */
+	uint64_t	smp_objsize;	/* size of the object */
+	uint64_t	smp_alloc;	/* space allocated from the map */
+	uint64_t	smp_pad[5];	/* reserved */
+
+	/*
+	 * The smp_histogram maintains a histogram of free regions. Each
+	 * bucket, smp_histogram[i], contains the number of free regions
+	 * whose size is:
+	 * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1)
+	 */
+	uint64_t	smp_histogram[SPACE_MAP_HISTOGRAM_SIZE];
+} space_map_phys_t;
+
+/*
+ * The space map object defines a region of space, its size, how much is
+ * allocated, and the on-disk object that stores this information.
+ * Consumers of space maps may only access the members of this structure.
+ */
 typedef struct space_map {
-	avl_tree_t	sm_root;	/* offset-ordered segment AVL tree */
-	uint64_t	sm_space;	/* sum of all segments in the map */
 	uint64_t	sm_start;	/* start of map */
 	uint64_t	sm_size;	/* size of map */
 	uint8_t		sm_shift;	/* unit shift */
-	uint8_t		sm_loaded;	/* map loaded? */
-	uint8_t		sm_loading;	/* map loading? */
-	uint8_t		sm_condensing;	/* map condensing? */
-	kcondvar_t	sm_load_cv;	/* map load completion */
-	space_map_ops_t	*sm_ops;	/* space map block picker ops vector */
-	avl_tree_t	*sm_pp_root;	/* size-ordered, picker-private tree */
-	void		*sm_ppd;	/* picker-private data */
+	uint64_t	sm_length;	/* synced length */
+	uint64_t	sm_alloc;	/* synced space allocated */
+	objset_t	*sm_os;		/* objset for this map */
+	uint64_t	sm_object;	/* object id for this map */
+	uint32_t	sm_blksz;	/* block size for space map */
+	dmu_buf_t	*sm_dbuf;	/* space_map_phys_t dbuf */
+	space_map_phys_t *sm_phys;	/* on-disk space map */
 	kmutex_t	*sm_lock;	/* pointer to lock that protects map */
 } space_map_t;
 
-typedef struct space_seg {
-	avl_node_t	ss_node;	/* AVL node */
-	avl_node_t	ss_pp_node;	/* AVL picker-private node */
-	uint64_t	ss_start;	/* starting offset of this segment */
-	uint64_t	ss_end;		/* ending offset (non-inclusive) */
-} space_seg_t;
-
-typedef struct space_ref {
-	avl_node_t	sr_node;	/* AVL node */
-	uint64_t	sr_offset;	/* offset (start or end) */
-	int64_t		sr_refcnt;	/* associated reference count */
-} space_ref_t;
-
-typedef struct space_map_obj {
-	uint64_t	smo_object;	/* on-disk space map object */
-	uint64_t	smo_objsize;	/* size of the object */
-	uint64_t	smo_alloc;	/* space allocated from the map */
-} space_map_obj_t;
-
-struct space_map_ops {
-	void	(*smop_load)(space_map_t *sm);
-	void	(*smop_unload)(space_map_t *sm);
-	uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size);
-	void	(*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
-	void	(*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
-	uint64_t (*smop_max)(space_map_t *sm);
-	boolean_t (*smop_fragmented)(space_map_t *sm);
-};
-
 /*
  * debug entry
  *
@@ -124,61 +129,34 @@
 
 #define	SM_RUN_MAX			SM_RUN_DECODE(~0ULL)
 
-#define	SM_ALLOC	0x0
-#define	SM_FREE		0x1
+typedef enum {
+	SM_ALLOC,
+	SM_FREE
+} maptype_t;
 
-/*
- * The data for a given space map can be kept on blocks of any size.
- * Larger blocks entail fewer i/o operations, but they also cause the
- * DMU to keep more data in-core, and also to waste more i/o bandwidth
- * when only a few blocks have changed since the last transaction group.
- * This could use a lot more research, but for now, set the freelist
- * block size to 4k (2^12).
- */
-#define	SPACE_MAP_BLOCKSHIFT	12
+int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
 
-typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size);
+void space_map_histogram_clear(space_map_t *sm);
+void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
+    dmu_tx_t *tx);
 
-extern void space_map_init(void);
-extern void space_map_fini(void);
-extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
-    uint8_t shift, kmutex_t *lp);
-extern void space_map_destroy(space_map_t *sm);
-extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
-extern boolean_t space_map_contains(space_map_t *sm,
-    uint64_t start, uint64_t size);
-extern space_seg_t *space_map_find(space_map_t *sm, uint64_t start,
-    uint64_t size, avl_index_t *wherep);
-extern void space_map_swap(space_map_t **msrc, space_map_t **mdest);
-extern void space_map_vacate(space_map_t *sm,
-    space_map_func_t *func, space_map_t *mdest);
-extern void space_map_walk(space_map_t *sm,
-    space_map_func_t *func, space_map_t *mdest);
+void space_map_update(space_map_t *sm);
 
-extern void space_map_load_wait(space_map_t *sm);
-extern int space_map_load(space_map_t *sm, space_map_ops_t *ops,
-    uint8_t maptype, space_map_obj_t *smo, objset_t *os);
-extern void space_map_unload(space_map_t *sm);
+uint64_t space_map_object(space_map_t *sm);
+uint64_t space_map_allocated(space_map_t *sm);
+uint64_t space_map_length(space_map_t *sm);
 
-extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size);
-extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size);
-extern uint64_t space_map_maxsize(space_map_t *sm);
+void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+    dmu_tx_t *tx);
+void space_map_truncate(space_map_t *sm, dmu_tx_t *tx);
+uint64_t space_map_alloc(objset_t *os, dmu_tx_t *tx);
+void space_map_free(space_map_t *sm, dmu_tx_t *tx);
 
-extern void space_map_sync(space_map_t *sm, uint8_t maptype,
-    space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx);
-extern void space_map_truncate(space_map_obj_t *smo,
-    objset_t *os, dmu_tx_t *tx);
+int space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
+    uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp);
+void space_map_close(space_map_t *sm);
 
-extern void space_map_ref_create(avl_tree_t *t);
-extern void space_map_ref_destroy(avl_tree_t *t);
-extern void space_map_ref_add_seg(avl_tree_t *t,
-    uint64_t start, uint64_t end, int64_t refcnt);
-extern void space_map_ref_add_map(avl_tree_t *t,
-    space_map_t *sm, int64_t refcnt);
-extern void space_map_ref_generate_map(avl_tree_t *t,
-    space_map_t *sm, int64_t minref);
+int64_t space_map_alloc_delta(space_map_t *sm);
 
 #ifdef	__cplusplus
 }
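
The smp_histogram comment above pins bucket i to free regions with
2^(i+sm_shift) <= size < 2^(i+sm_shift+1); e.g. with sm_shift = 9, a 3 KB
free region satisfies 2^11 <= 3072 < 2^12 and lands in bucket 2. A hedged
sketch of the implied bucket computation (the helper name is hypothetical
and highbit64() from sys/sysmacros.h is assumed; the real update logic is
in space_map.c):

	static int
	smp_histogram_bucket(const space_map_t *sm, uint64_t size)
	{
		/* highbit64(2^n) == n + 1, so an exact power maps to i. */
		int i = highbit64(size) - 1 - sm->sm_shift;

		return (MIN(MAX(i, 0), SPACE_MAP_HISTOGRAM_SIZE - 1));
	}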

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -23,7 +24,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_TXG_H
@@ -74,13 +75,9 @@
 extern void txg_rele_to_sync(txg_handle_t *txghp);
 extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
 
-/*
- * Delay the caller by the specified number of ticks or until
- * the txg closes (whichever comes first).  This is intended
- * to be used to throttle writers when the system nears its
- * capacity.
- */
-extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks);
+extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta,
+    hrtime_t resolution);
+extern void txg_kick(struct dsl_pool *dp);
 
 /*
  * Wait until the given transaction group has finished syncing.
@@ -116,6 +113,7 @@
 extern void txg_list_create(txg_list_t *tl, size_t offset);
 extern void txg_list_destroy(txg_list_t *tl);
 extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg);
+extern boolean_t txg_all_lists_empty(txg_list_t *tl);
 extern boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
 extern boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg);
 extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
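
txg_delay() now takes a high-resolution delay and quantum rather than
clock ticks. A minimal sketch of the new calling convention, assuming the
MSEC2NSEC() conversion macro; the 10 ms / 1 ms figures are purely
illustrative:

	/* Throttle this writer by ~10 ms, quantized to 1 ms. */
	txg_delay(dp, txg, MSEC2NSEC(10), MSEC2NSEC(1));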

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -18,6 +19,7 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
@@ -70,7 +72,7 @@
 	kmutex_t	tc_open_lock;	/* protects tx_open_txg */
 	kmutex_t	tc_lock;	/* protects the rest of this struct */
 	kcondvar_t	tc_cv[TXG_SIZE];
-	uint64_t	tc_count[TXG_SIZE];
+	uint64_t	tc_count[TXG_SIZE];	/* tx hold count on each txg */
 	list_t		tc_callbacks[TXG_SIZE]; /* commit cb list */
 	char		tc_pad[8];		/* pad to fill 3 cache lines */
 };
@@ -87,13 +89,16 @@
  * every cpu (see txg_quiesce()).
  */
 typedef struct tx_state {
-	tx_cpu_t	*tx_cpu;	/* protects right to enter txg	*/
-	kmutex_t	tx_sync_lock;	/* protects tx_state_t */
+	tx_cpu_t	*tx_cpu;	/* protects access to tx_open_txg */
+	kmutex_t	tx_sync_lock;	/* protects the rest of this struct */
+
 	uint64_t	tx_open_txg;	/* currently open txg id */
 	uint64_t	tx_quiesced_txg; /* quiesced txg waiting for sync */
 	uint64_t	tx_syncing_txg;	/* currently syncing txg id */
 	uint64_t	tx_synced_txg;	/* last synced txg id */
 
+	hrtime_t	tx_open_time;	/* start time of tx_open_txg */
+
 	uint64_t	tx_sync_txg_waiting; /* txg we're waiting to sync */
 	uint64_t	tx_quiesce_txg_waiting; /* txg we're waiting to open */
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -22,6 +23,9 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
 
 #ifndef _SYS_UBERBLOCK_H
 #define	_SYS_UBERBLOCK_H
@@ -36,8 +40,8 @@
 
 typedef struct uberblock uberblock_t;
 
-extern int uberblock_verify(uberblock_t *ub);
-extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg);
+extern int uberblock_verify(uberblock_t *);
+extern boolean_t uberblock_update(uberblock_t *, vdev_t *, uint64_t);
 
 #ifdef	__cplusplus
 }

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,6 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_UBERBLOCK_IMPL_H
@@ -54,6 +56,12 @@
 
 	/* highest SPA_VERSION supported by software that wrote this txg */
 	uint64_t	ub_software_version;
+
+	/* These fields are reserved for features that are under development: */
+	uint64_t	ub_mmp_magic;
+	uint64_t	ub_mmp_delay;
+	uint64_t	ub_mmp_seq;
+	uint64_t	ub_checkpoint_txg;
 };
 
 #ifdef	__cplusplus

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -21,7 +22,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_H
@@ -61,6 +62,7 @@
 extern boolean_t vdev_is_bootable(vdev_t *vd);
 extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
 extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
+extern int vdev_count_leaves(spa_t *spa);
 extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
     uint64_t txg, uint64_t size);
 extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
@@ -78,6 +80,7 @@
 extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
 extern void vdev_metaslab_fini(vdev_t *vd);
 extern void vdev_metaslab_set_size(vdev_t *);
+extern void vdev_ashift_optimize(vdev_t *);
 extern void vdev_expand(vdev_t *vd, uint64_t txg);
 extern void vdev_split(vdev_t *vd);
 extern void vdev_deadman(vdev_t *vd);
@@ -111,7 +114,7 @@
 
 extern void vdev_cache_init(vdev_t *vd);
 extern void vdev_cache_fini(vdev_t *vd);
-extern int vdev_cache_read(zio_t *zio);
+extern boolean_t vdev_cache_read(zio_t *zio);
 extern void vdev_cache_write(zio_t *zio);
 extern void vdev_cache_purge(vdev_t *vd);
 
@@ -119,11 +122,13 @@
 extern void vdev_queue_fini(vdev_t *vd);
 extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
+extern int vdev_queue_length(vdev_t *vd);
+extern uint64_t vdev_queue_lastoffset(vdev_t *vd);
+extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio);
 
 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);
-extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
-    boolean_t);
+extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
 
 extern void vdev_state_dirty(vdev_t *vd);
 extern void vdev_state_clean(vdev_t *vd);
@@ -158,6 +163,8 @@
 
 extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
 
+extern int vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size);
+
 #ifdef	__cplusplus
 }
 #endif

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -21,13 +22,13 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright (c) 2013 Joyent, Inc. All rights reserved.
+ * Copyright 2012 Nexenta Systems, Inc.  All rights reserved.
  */
 
 #ifndef _SYS_VDEV_DISK_H
 #define	_SYS_VDEV_DISK_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/vdev.h>
 #ifdef _KERNEL
 #include <sys/buf.h>
@@ -40,14 +41,25 @@
 extern "C" {
 #endif
 
+#ifdef _KERNEL
 typedef struct vdev_disk {
 	ddi_devid_t	vd_devid;
 	char		*vd_minor;
 	ldi_handle_t	vd_lh;
+	list_t		vd_ldi_cbs;
+	boolean_t	vd_ldi_offline;
 } vdev_disk_t;
+#endif
 
+extern int vdev_disk_physio(vdev_t *,
+    caddr_t, size_t, uint64_t, int, boolean_t);
+
+/*
+ * Since vdev_disk.c is not compiled into libzpool, this function should only be
+ * defined in the zfs kernel module.
+ */
 #ifdef _KERNEL
-extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
+extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
 #endif
 #ifdef	__cplusplus
 }

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -39,6 +40,9 @@
 	vnode_t		*vf_vnode;
 } vdev_file_t;
 
+extern void vdev_file_init(void);
+extern void vdev_file_fini(void);
+
 #ifdef	__cplusplus
 }
 #endif

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_IMPL_H
@@ -53,14 +54,17 @@
 typedef struct vdev_cache vdev_cache_t;
 typedef struct vdev_cache_entry vdev_cache_entry_t;
 
+extern int zfs_vdev_queue_depth_pct;
+extern uint32_t zfs_vdev_async_write_max_active;
+
 /*
  * Virtual device operations
  */
 typedef int	vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
-    uint64_t *ashift);
+    uint64_t *logical_ashift, uint64_t *physical_ashift);
 typedef void	vdev_close_func_t(vdev_t *vd);
 typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
-typedef int	vdev_io_start_func_t(zio_t *zio);
+typedef void	vdev_io_start_func_t(zio_t *zio);
 typedef void	vdev_io_done_func_t(zio_t *zio);
 typedef void	vdev_state_change_func_t(vdev_t *vd, int, int);
 typedef void	vdev_hold_func_t(vdev_t *vd);
@@ -99,13 +103,26 @@
 	kmutex_t	vc_lock;
 };
 
+typedef struct vdev_queue_class {
+	uint32_t	vqc_active;
+
+	/*
+	 * Sorted by offset or timestamp, depending on whether the queue is
+	 * LBA-ordered vs FIFO.
+	 */
+	avl_tree_t	vqc_queued_tree;
+} vdev_queue_class_t;
+
 struct vdev_queue {
-	avl_tree_t	vq_deadline_tree;
-	avl_tree_t	vq_read_tree;
-	avl_tree_t	vq_write_tree;
-	avl_tree_t	vq_pending_tree;
-	hrtime_t	vq_io_complete_ts;
+	vdev_t		*vq_vdev;
+	vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
+	avl_tree_t	vq_active_tree;
+	avl_tree_t	vq_read_offset_tree;
+	avl_tree_t	vq_write_offset_tree;
+	uint64_t	vq_last_offset;
+	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	kmutex_t	vq_lock;
+	uint64_t	vq_lastoffset;
 };
 
 /*
@@ -123,6 +140,24 @@
 	uint64_t	vdev_min_asize;	/* min acceptable asize		*/
 	uint64_t	vdev_max_asize;	/* max acceptable asize		*/
 	uint64_t	vdev_ashift;	/* block alignment shift	*/
+	/*
+	 * Logical block alignment shift
+	 *
+	 * The smallest sized/aligned I/O supported by the device.
+	 */
+	uint64_t        vdev_logical_ashift;
+	/*
+	 * Physical block alignment shift
+	 *
+	 * The device supports logical I/Os with vdev_logical_ashift
+	 * size/alignment, but optimum performance will be achieved by
+	 * aligning/sizing requests to vdev_physical_ashift.  Smaller
+	 * requests may be inflated or incur device level read-modify-write
+	 * operations.
+	 *
+	 * May be 0 to indicate no preference (i.e. use vdev_logical_ashift).
+	 */
+	uint64_t        vdev_physical_ashift;
 	uint64_t	vdev_state;	/* see VDEV_STATE_* #defines	*/
 	uint64_t	vdev_prevstate;	/* used when reopening a vdev	*/
 	vdev_ops_t	*vdev_ops;	/* vdev operations		*/
@@ -134,7 +169,6 @@
 	vdev_t		*vdev_parent;	/* parent vdev			*/
 	vdev_t		**vdev_child;	/* array of children		*/
 	uint64_t	vdev_children;	/* number of children		*/
-	space_map_t	vdev_dtl[DTL_TYPES]; /* in-core dirty time logs	*/
 	vdev_stat_t	vdev_stat;	/* virtual device statistics	*/
 	boolean_t	vdev_expanding;	/* expand the vdev?		*/
 	boolean_t	vdev_reopening;	/* reopen in progress?		*/
@@ -155,25 +189,38 @@
 	txg_node_t	vdev_txg_node;	/* per-txg dirty vdev linkage	*/
 	boolean_t	vdev_remove_wanted; /* async remove wanted?	*/
 	boolean_t	vdev_probe_wanted; /* async probe wanted?	*/
-	uint64_t	vdev_removing;	/* device is being removed?	*/
 	list_node_t	vdev_config_dirty_node; /* config dirty list	*/
 	list_node_t	vdev_state_dirty_node; /* state dirty list	*/
 	uint64_t	vdev_deflate_ratio; /* deflation ratio (x512)	*/
 	uint64_t	vdev_islog;	/* is an intent log device	*/
-	uint64_t	vdev_ishole;	/* is a hole in the namespace 	*/
+	uint64_t	vdev_removing;	/* device is being removed?	*/
+	boolean_t	vdev_ishole;	/* is a hole in the namespace	*/
+	kmutex_t	vdev_queue_lock; /* protects vdev_queue_depth	*/
 
 	/*
+	 * The queue depth parameters determine how many async writes are
+	 * still pending (i.e. allocated but not yet issued to disk) per
+	 * top-level (vdev_async_write_queue_depth) and the maximum allowed
+	 * (vdev_max_async_write_queue_depth). These values only apply to
+	 * top-level vdevs.
+	 */
+	uint64_t	vdev_async_write_queue_depth;
+	uint64_t	vdev_max_async_write_queue_depth;
+
+	/*
 	 * Leaf vdev state.
 	 */
+	range_tree_t	*vdev_dtl[DTL_TYPES]; /* dirty time logs	*/
+	space_map_t	*vdev_dtl_sm;	/* dirty time log space map	*/
+	txg_node_t	vdev_dtl_node;	/* per-txg dirty DTL linkage	*/
+	uint64_t	vdev_dtl_object; /* DTL object			*/
 	uint64_t	vdev_psize;	/* physical device capacity	*/
-	space_map_obj_t	vdev_dtl_smo;	/* dirty time log space map obj	*/
-	txg_node_t	vdev_dtl_node;	/* per-txg dirty DTL linkage	*/
 	uint64_t	vdev_wholedisk;	/* true if this is a whole disk */
 	uint64_t	vdev_offline;	/* persistent offline state	*/
 	uint64_t	vdev_faulted;	/* persistent faulted state	*/
 	uint64_t	vdev_degraded;	/* persistent degraded state	*/
 	uint64_t	vdev_removed;	/* persistent removed state	*/
-	uint64_t	vdev_resilvering; /* persistent resilvering state */
+	uint64_t	vdev_resilver_txg; /* persistent resilvering state */
 	uint64_t	vdev_nparity;	/* number of parity devices for raidz */
 	char		*vdev_path;	/* vdev path (if any)		*/
 	char		*vdev_devid;	/* vdev devid (if any)		*/
@@ -181,7 +228,6 @@
 	char		*vdev_fru;	/* physical FRU location	*/
 	uint64_t	vdev_not_present; /* not present during import	*/
 	uint64_t	vdev_unspare;	/* unspare when resilvering done */
-	hrtime_t	vdev_last_try;	/* last reopen time		*/
 	boolean_t	vdev_nowritecache; /* true if flushwritecache failed */
 	boolean_t	vdev_notrim;	/* true if trim failed */
 	boolean_t	vdev_checkremove; /* temporary online test	*/
@@ -188,18 +234,21 @@
 	boolean_t	vdev_forcefault; /* force online fault		*/
 	boolean_t	vdev_splitting;	/* split or repair in progress  */
 	boolean_t	vdev_delayed_close; /* delayed device close?	*/
-	uint8_t		vdev_tmpoffline; /* device taken offline temporarily? */
-	uint8_t		vdev_detached;	/* device detached?		*/
-	uint8_t		vdev_cant_read;	/* vdev is failing all reads	*/
-	uint8_t		vdev_cant_write; /* vdev is failing all writes	*/
-	uint64_t	vdev_isspare;	/* was a hot spare		*/
-	uint64_t	vdev_isl2cache;	/* was a l2cache device		*/
+	boolean_t	vdev_tmpoffline; /* device taken offline temporarily? */
+	boolean_t	vdev_detached;	/* device detached?		*/
+	boolean_t	vdev_cant_read;	/* vdev is failing all reads	*/
+	boolean_t	vdev_cant_write; /* vdev is failing all writes	*/
+	boolean_t	vdev_isspare;	/* was a hot spare		*/
+	boolean_t	vdev_isl2cache;	/* was a l2cache device		*/
 	vdev_queue_t	vdev_queue;	/* I/O deadline schedule queue	*/
 	vdev_cache_t	vdev_cache;	/* physical block cache		*/
-	spa_aux_vdev_t	*vdev_aux;	/* for l2cache vdevs		*/
+	spa_aux_vdev_t	*vdev_aux;	/* for l2cache and spares vdevs	*/
 	zio_t		*vdev_probe_zio; /* root of current probe	*/
 	vdev_aux_t	vdev_label_aux;	/* on-disk aux state		*/
-	struct trim_map	*vdev_trimmap;
+	struct trim_map	*vdev_trimmap;	/* map of outstanding trims	*/
+	uint16_t	vdev_rotation_rate; /* rotational rate of the media */
+#define	VDEV_RATE_UNKNOWN	0
+#define	VDEV_RATE_NON_ROTATING	1
 
 	/*
 	 * For DTrace to work in userland (libzpool) context, these fields must
@@ -221,8 +270,11 @@
 #define	VDEV_PHYS_SIZE		(112 << 10)
 #define	VDEV_UBERBLOCK_RING	(128 << 10)
 
+/* The largest uberblock we support is 8k. */
+#define	MAX_UBERBLOCK_SHIFT (13)
 #define	VDEV_UBERBLOCK_SHIFT(vd)	\
-	MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)
+	MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \
+	    MAX_UBERBLOCK_SHIFT)
 #define	VDEV_UBERBLOCK_COUNT(vd)	\
 	(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
 #define	VDEV_UBERBLOCK_OFFSET(vd, n)	\
@@ -295,9 +347,11 @@
 extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
 extern boolean_t vdev_log_state_valid(vdev_t *vd);
 extern void vdev_load(vdev_t *vd);
+extern int vdev_dtl_load(vdev_t *vd);
 extern void vdev_sync(vdev_t *vd, uint64_t txg);
 extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
 extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
+extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);
 
 /*
  * Available vdev types.
@@ -328,7 +382,18 @@
  */
 /* zdb uses this tunable, so it must be declared here to make lint happy. */
 extern int zfs_vdev_cache_size;
+extern uint_t zfs_geom_probe_vdev_key;
 
+#ifdef illumos
+/*
+ * The vdev_buf_t is used to translate between zio_t and buf_t, and back again.
+ */
+typedef struct vdev_buf {
+	buf_t	vb_buf;		/* buffer that describes the io */
+	zio_t	*vb_io;		/* pointer back to the original zio_t */
+} vdev_buf_t;
+#endif
+
 #ifdef	__cplusplus
 }
 #endif
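
Worked numbers for the newly clamped VDEV_UBERBLOCK_SHIFT() above,
assuming UBERBLOCK_SHIFT is 10 (1 KB):

	/*
	 * ashift = 12 (4 KB):  shift = MAX(12, 10) = 12 (unclamped)
	 *                      count = (128 << 10) >> 12 = 32 uberblocks
	 * ashift = 14 (16 KB): MIN(14, MAX_UBERBLOCK_SHIFT) clamps to 13
	 *                      count = (128 << 10) >> 13 = 16 uberblocks
	 */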

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_ZAP_H
@@ -80,6 +81,7 @@
  */
 
 #include <sys/dmu.h>
+#include <sys/refcount.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -141,6 +143,12 @@
     uint64_t parent_obj, const char *name, dmu_tx_t *tx);
 
 /*
+ * Initialize an already-allocated object.
+ */
+void mzap_create_impl(objset_t *os, uint64_t obj, int normflags,
+    zap_flags_t flags, dmu_tx_t *tx);
+
+/*
  * Create a new zapobj with no attributes from the given (unallocated)
  * object number.
  */
@@ -209,9 +217,15 @@
 int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
 int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints);
+int zap_lookup_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf);
+int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf,
+    matchtype_t mt, char *realname, int rn_len,
+    boolean_t *ncp);
 
-int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
-    int add, uint64_t *towrite, uint64_t *tooverwrite);
+int zap_count_write_by_dnode(dnode_t *dn, const char *name,
+    int add, refcount_t *towrite, refcount_t *tooverwrite);
 
 /*
  * Create an attribute with the given name and value.
@@ -337,7 +351,7 @@
 	boolean_t za_normalization_conflict;
 	uint64_t za_num_integers;
 	uint64_t za_first_integer;	/* no sign extension for <8byte ints */
-	char za_name[MAXNAMELEN];
+	char za_name[ZAP_MAXNAMELEN];
 } zap_attribute_t;
 
 /*

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,6 +21,9 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef	_SYS_ZAP_IMPL_H
@@ -41,8 +45,7 @@
 
 #define	MZAP_ENT_LEN		64
 #define	MZAP_NAME_LEN		(MZAP_ENT_LEN - 8 - 4 - 2)
-#define	MZAP_MAX_BLKSHIFT	SPA_MAXBLOCKSHIFT
-#define	MZAP_MAX_BLKSZ		(1 << MZAP_MAX_BLKSHIFT)
+#define	MZAP_MAX_BLKSZ		SPA_OLD_MAXBLOCKSIZE
 
 #define	ZAP_NEED_CD		(-1U)
 
@@ -70,7 +73,7 @@
 } mzap_ent_t;
 
 #define	MZE_PHYS(zap, mze) \
-	(&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid])
+	(&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid])
 
 /*
  * The (fat) zap is stored in one object. It is an array of
@@ -104,7 +107,7 @@
  * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
  */
 #define	ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
-	((uint64_t *)(zap)->zap_f.zap_phys) \
+	((uint64_t *)zap_f_phys(zap)) \
 	[(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
 
 /*
@@ -140,6 +143,7 @@
 typedef struct zap_table_phys zap_table_phys_t;
 
 typedef struct zap {
+	dmu_buf_user_t zap_dbu;
 	objset_t *zap_objset;
 	uint64_t zap_object;
 	struct dmu_buf *zap_dbuf;
@@ -149,8 +153,6 @@
 	uint64_t zap_salt;
 	union {
 		struct {
-			zap_phys_t *zap_phys;
-
 			/*
 			 * zap_num_entries_mtx protects
 			 * zap_num_entries
@@ -159,7 +161,6 @@
 			int zap_block_shift;
 		} zap_fat;
 		struct {
-			mzap_phys_t *zap_phys;
 			int16_t zap_num_entries;
 			int16_t zap_num_chunks;
 			int16_t zap_alloc_next;
@@ -168,6 +169,18 @@
 	} zap_u;
 } zap_t;
 
+inline zap_phys_t *
+zap_f_phys(zap_t *zap)
+{
+	return (zap->zap_dbuf->db_data);
+}
+
+inline mzap_phys_t *
+zap_m_phys(zap_t *zap)
+{
+	return (zap->zap_dbuf->db_data);
+}
+
 typedef struct zap_name {
 	zap_t *zn_zap;
 	int zn_key_intlen;
@@ -185,9 +198,9 @@
 
 boolean_t zap_match(zap_name_t *zn, const char *matchname);
 int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
-    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
-void zap_unlockdir(zap_t *zap);
-void zap_evict(dmu_buf_t *db, void *vmzap);
+    krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp);
+void zap_unlockdir(zap_t *zap, void *tag);
+void zap_evict(void *dbu);
 zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
 void zap_name_free(zap_name_t *zn);
 int zap_hashbits(zap_t *zap);
@@ -202,12 +215,13 @@
     uint64_t integer_size, uint64_t num_integers, void *buf,
     char *realname, int rn_len, boolean_t *normalization_conflictp);
 void fzap_prefetch(zap_name_t *zn);
-int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
-    uint64_t *tooverwrite);
+int fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite,
+    refcount_t *tooverwrite);
 int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx);
+    const void *val, void *tag, dmu_tx_t *tx);
 int fzap_update(zap_name_t *zn,
-    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+    int integer_size, uint64_t num_integers, const void *val,
+    void *tag, dmu_tx_t *tx);
 int fzap_length(zap_name_t *zn,
     uint64_t *integer_size, uint64_t *num_integers);
 int fzap_remove(zap_name_t *zn, dmu_tx_t *tx);
@@ -217,7 +231,7 @@
 
 int fzap_add_cd(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
-    const void *val, uint32_t cd, dmu_tx_t *tx);
+    const void *val, uint32_t cd, void *tag, dmu_tx_t *tx);
 void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
 int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);
 
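The cached zap_phys pointers removed above are replaced by the
zap_f_phys()/zap_m_phys() accessors, which rederive the physical layout
from the live dbuf on every use so the pointer can never go stale across
dbuf-user eviction. A hedged before/after sketch (the mze_value access is
illustrative):

	/* Before: cached pointer, stale once the dbuf user is evicted. */
	v = zap->zap_m.zap_phys->mz_chunk[chunkid].mze_value;

	/* After: always derived from the dbuf backing the zap. */
	v = zap_m_phys(zap)->mz_chunk[chunkid].mze_value;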

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,6 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef	_SYS_ZAP_LEAF_H
@@ -83,7 +85,7 @@
  */
 #define	ZAP_LEAF_CHUNK(l, idx) \
 	((zap_leaf_chunk_t *) \
-	((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
+	(zap_leaf_phys(l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
 #define	ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
 
 typedef enum zap_chunk_type {
@@ -152,13 +154,18 @@
 } zap_leaf_chunk_t;
 
 typedef struct zap_leaf {
+	dmu_buf_user_t l_dbu;
 	krwlock_t l_rwlock;
 	uint64_t l_blkid;		/* 1<<ZAP_BLOCK_SHIFT byte block off */
 	int l_bs;			/* block size shift */
 	dmu_buf_t *l_dbuf;
-	zap_leaf_phys_t *l_phys;
 } zap_leaf_t;
 
+inline zap_leaf_phys_t *
+zap_leaf_phys(zap_leaf_t *l)
+{
+	return (l->l_dbuf->db_data);
+}
 
 typedef struct zap_entry_handle {
 	/* Set by zap_leaf and public to ZAP */

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  */
 
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_ZFEATURE_H
@@ -27,6 +28,7 @@
 #define	_SYS_ZFEATURE_H
 
 #include <sys/nvpair.h>
+#include <sys/txg.h>
 #include "zfeature_common.h"
 
 #ifdef	__cplusplus
@@ -33,21 +35,38 @@
 extern "C" {
 #endif
 
+#define	VALID_FEATURE_FID(fid)	((fid) >= 0 && (fid) < SPA_FEATURES)
+#define	VALID_FEATURE_OR_NONE(fid)	((fid) == SPA_FEATURE_NONE ||	\
+    VALID_FEATURE_FID(fid))
+
 struct spa;
 struct dmu_tx;
 struct objset;
 
-extern boolean_t feature_is_supported(struct objset *os, uint64_t obj,
-    uint64_t desc_obj, nvlist_t *unsup_feat, nvlist_t *enabled_feat);
-
 extern void spa_feature_create_zap_objects(struct spa *, struct dmu_tx *);
-extern void spa_feature_enable(struct spa *, zfeature_info_t *,
+extern void spa_feature_enable(struct spa *, spa_feature_t,
     struct dmu_tx *);
-extern void spa_feature_incr(struct spa *, zfeature_info_t *, struct dmu_tx *);
-extern void spa_feature_decr(struct spa *, zfeature_info_t *, struct dmu_tx *);
-extern boolean_t spa_feature_is_enabled(struct spa *, zfeature_info_t *);
-extern boolean_t spa_feature_is_active(struct spa *, zfeature_info_t *);
+extern void spa_feature_incr(struct spa *, spa_feature_t, struct dmu_tx *);
+extern void spa_feature_decr(struct spa *, spa_feature_t, struct dmu_tx *);
+extern boolean_t spa_feature_is_enabled(struct spa *, spa_feature_t);
+extern boolean_t spa_feature_is_active(struct spa *, spa_feature_t);
+extern boolean_t spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid,
+    uint64_t *txg);
+extern uint64_t spa_feature_refcount(spa_t *, spa_feature_t, uint64_t);
+extern boolean_t spa_features_check(spa_t *, boolean_t, nvlist_t *, nvlist_t *);
 
+/*
+ * These functions are only exported for zhack and zdb; normal callers should
+ * use the above interfaces.
+ */
+extern int feature_get_refcount(struct spa *, zfeature_info_t *, uint64_t *);
+extern int feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
+    uint64_t *res);
+extern void feature_enable_sync(struct spa *, zfeature_info_t *,
+    struct dmu_tx *);
+extern void feature_sync(struct spa *, zfeature_info_t *, uint64_t,
+    struct dmu_tx *);
+
 #ifdef	__cplusplus
 }
 #endif
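
The interface change above swaps zfeature_info_t pointers for the
spa_feature_t enum throughout. A minimal sketch of a post-change caller,
mirroring what a spa_maxblocksize()-style helper would plausibly do
(SPA_FEATURE_LARGE_BLOCKS comes from zfeature_common.h):

	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
		maxblocksize = SPA_MAXBLOCKSIZE;
	else
		maxblocksize = SPA_OLD_MAXBLOCKSIZE;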

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -24,7 +25,7 @@
  */
 /*
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_ZFS_CONTEXT_H
@@ -94,16 +95,14 @@
 #include <sys/sunddi.h>
 #ifdef illumos
 #include <sys/cyclic.h>
-#else	/* FreeBSD */
-#include <sys/callout.h>
 #endif
-
+#include <sys/callo.h>
+#include <sys/disp.h>
 #include <machine/stdarg.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
-#include <vm/vm_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 /* There is clash. vm_map.h defines the two below and vdev_cache.c use them. */

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -44,7 +45,7 @@
 
 void zfsctl_create(zfsvfs_t *);
 void zfsctl_destroy(zfsvfs_t *);
-vnode_t *zfsctl_root(znode_t *);
+int zfsctl_root(zfsvfs_t *, int, vnode_t **);
 void zfsctl_init(void);
 void zfsctl_fini(void);
 boolean_t zfsctl_is_node(vnode_t *);
@@ -53,15 +54,10 @@
 int zfsctl_destroy_snapshot(const char *snapname, int force);
 int zfsctl_umount_snapshots(vfs_t *, int, cred_t *);
 
-int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
-    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
-    int *direntflags, pathname_t *realpnp);
-
 int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
 
 #define	ZFSCTL_INO_ROOT		0x1
 #define	ZFSCTL_INO_SNAPDIR	0x2
-#define	ZFSCTL_INO_SHARES	0x3
 
 #ifdef	__cplusplus
 }

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_ZFS_DEBUG_H
@@ -49,14 +50,17 @@
 #endif
 
 extern int zfs_flags;
+extern boolean_t zfs_recover;
+extern boolean_t zfs_free_leak_on_eio;
 
-#define	ZFS_DEBUG_DPRINTF	(1<<0)
-#define	ZFS_DEBUG_DBUF_VERIFY	(1<<1)
-#define	ZFS_DEBUG_DNODE_VERIFY	(1<<2)
-#define	ZFS_DEBUG_SNAPNAMES	(1<<3)
-#define	ZFS_DEBUG_MODIFY	(1<<4)
-#define	ZFS_DEBUG_SPA		(1<<5)
-#define	ZFS_DEBUG_ZIO_FREE	(1<<6)
+#define	ZFS_DEBUG_DPRINTF		(1<<0)
+#define	ZFS_DEBUG_DBUF_VERIFY		(1<<1)
+#define	ZFS_DEBUG_DNODE_VERIFY		(1<<2)
+#define	ZFS_DEBUG_SNAPNAMES		(1<<3)
+#define	ZFS_DEBUG_MODIFY		(1<<4)
+#define	ZFS_DEBUG_SPA			(1<<5)
+#define	ZFS_DEBUG_ZIO_FREE		(1<<6)
+#define	ZFS_DEBUG_HISTOGRAM_VERIFY	(1<<7)
 
 #ifdef ZFS_DEBUG
 extern void __dprintf(const char *file, const char *func,
@@ -79,6 +83,7 @@
 extern void zfs_dbgmsg_init(void);
 extern void zfs_dbgmsg_fini(void);
 extern void zfs_dbgmsg(const char *fmt, ...);
+extern void zfs_dbgmsg_print(const char *tag);
 
 #ifdef illumos
 #ifndef _KERNEL
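
The widened flag table above is consumed as a plain bitmask on zfs_flags;
a hedged illustration of enabling dprintf output together with dbuf
verification from code (these bits are normally set through the debug
tunable rather than directly):

	zfs_flags |= ZFS_DEBUG_DPRINTF | ZFS_DEBUG_DBUF_VERIFY;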

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -48,18 +49,18 @@
 #define	IS_ROOT_NODE	0x01		/* create a root node */
 #define	IS_XATTR	0x02		/* create an extended attribute node */
 
-extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
-    int, int *, pathname_t *);
-extern void zfs_dirent_unlock(zfs_dirlock_t *);
-extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int);
-extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
+extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int);
+extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int);
+extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int,
     boolean_t *);
-extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *,
-    pathname_t *);
+#if 0
+extern int zfs_dirlook(vnode_t *, const char *, vnode_t **, int);
+#else
+extern int zfs_dirlook(znode_t *, const char *name, znode_t **);
+#endif
 extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
     uint_t, znode_t **, zfs_acl_ids_t *);
 extern void zfs_rmnode(znode_t *);
-extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
 extern boolean_t zfs_dirempty(znode_t *);
 extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
 extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,10 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 RackTop Systems.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef	_SYS_ZFS_IOCTL_H
@@ -79,19 +83,32 @@
  * Feature flags for zfs send streams (flags in drr_versioninfo)
  */
 
-#define	DMU_BACKUP_FEATURE_DEDUP	(0x1)
-#define	DMU_BACKUP_FEATURE_DEDUPPROPS	(0x2)
-#define	DMU_BACKUP_FEATURE_SA_SPILL	(0x4)
+#define	DMU_BACKUP_FEATURE_DEDUP		(1 << 0)
+#define	DMU_BACKUP_FEATURE_DEDUPPROPS		(1 << 1)
+#define	DMU_BACKUP_FEATURE_SA_SPILL		(1 << 2)
+/* flags #3 - #15 are reserved for incompatible closed-source implementations */
+#define	DMU_BACKUP_FEATURE_EMBED_DATA		(1 << 16)
+#define	DMU_BACKUP_FEATURE_EMBED_DATA_LZ4	(1 << 17)
+/* flag #18 is reserved for a Delphix feature */
+#define	DMU_BACKUP_FEATURE_LARGE_BLOCKS		(1 << 19)
+#define	DMU_BACKUP_FEATURE_RESUMING		(1 << 20)
 
 /*
  * Mask of all supported backup features
  */
 #define	DMU_BACKUP_FEATURE_MASK	(DMU_BACKUP_FEATURE_DEDUP | \
-		DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL)
+    DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
+    DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
+    DMU_BACKUP_FEATURE_RESUMING | \
+    DMU_BACKUP_FEATURE_LARGE_BLOCKS)
 
 /* Are all features in the given flag word currently supported? */
 #define	DMU_STREAM_SUPPORTED(x)	(!((x) & ~DMU_BACKUP_FEATURE_MASK))
 
+typedef enum dmu_send_resume_token_version {
+	ZFS_SEND_RESUME_TOKEN_VERSION = 1
+} dmu_send_resume_token_version_t;
+
 /*
  * The drr_versioninfo field of the dmu_replay_record has the
  * following layout:
@@ -111,8 +128,22 @@
 
 #define	DMU_BACKUP_MAGIC 0x2F5bacbacULL
 
+/*
+ * Send stream flags.  Bits 24-31 are reserved for vendor-specific
+ * implementations and should not be used.
+ */
 #define	DRR_FLAG_CLONE		(1<<0)
 #define	DRR_FLAG_CI_DATA	(1<<1)
+/*
+ * This send stream, if it is a full send, includes the FREE and FREEOBJECT
+ * records that are created by the sending process.  This means that the send
+ * stream can be received as a clone, even though it is not an incremental.
+ * This is not implemented as a feature flag, because the receiving side does
+ * not need to have implemented it to receive this stream; it is fully backwards
+ * compatible.  We need a flag, though, because full send streams without it
+ * cannot necessarily be received as a clone correctly.
+ */
+#define	DRR_FLAG_FREERECORDS	(1<<2)
 
 /*
  * flags in the drr_checksumflags field in the DRR_WRITE and
@@ -210,7 +241,7 @@
 	enum {
 		DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
 		DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
-		DRR_SPILL, DRR_NUMTYPES
+		DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES
 	} drr_type;
 	uint32_t drr_payloadlen;
 	union {
@@ -222,6 +253,35 @@
 		struct drr_free drr_free;
 		struct drr_write_byref drr_write_byref;
 		struct drr_spill drr_spill;
+		struct drr_write_embedded {
+			uint64_t drr_object;
+			uint64_t drr_offset;
+			/* logical length, should equal blocksize */
+			uint64_t drr_length;
+			uint64_t drr_toguid;
+			uint8_t drr_compression;
+			uint8_t drr_etype;
+			uint8_t drr_pad[6];
+			uint32_t drr_lsize; /* uncompressed size of payload */
+			uint32_t drr_psize; /* compr. (real) size of payload */
+			/* (possibly compressed) content follows */
+		} drr_write_embedded;
+
+		/*
+		 * Note: drr_checksum is overlaid with all record types
+		 * except DRR_BEGIN.  Therefore its (non-pad) members
+		 * must not overlap with members from the other structs.
+		 * We accomplish this by putting its members at the very
+		 * end of the struct.
+		 */
+		struct drr_checksum {
+			uint64_t drr_pad[34];
+			/*
+			 * fletcher-4 checksum of everything preceding the
+			 * checksum.
+			 */
+			zio_cksum_t drr_checksum;
+		} drr_checksum;
 	} drr_u;
 } dmu_replay_record_t;
 
@@ -256,6 +316,7 @@
 	uint32_t	zi_iotype;
 	int32_t		zi_duration;
 	uint64_t	zi_timer;
+	uint64_t	zi_nlanes;
 	uint32_t	zi_cmd;
 	uint32_t	zi_pad;
 } zinject_record_t;
@@ -293,6 +354,12 @@
 	ZFS_CASE_MIXED
 } zfs_case_t;
 
+/*
+ * Note: this struct must have the same layout in 32-bit and 64-bit, so
+ * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit
+ * kernel.  Therefore, we add padding to it so that no "hidden" padding
+ * is automatically added on 64-bit (but not on 32-bit).
+ */
 typedef struct zfs_cmd {
 	char		zc_name[MAXPATHLEN];	/* name of pool or dataset */
 	uint64_t	zc_nvlist_src;		/* really (char *) */
@@ -322,14 +389,16 @@
 	zfs_share_t	zc_share;
 	uint64_t	zc_jailid;
 	dmu_objset_stats_t zc_objset_stats;
-	struct drr_begin zc_begin_record;
+	dmu_replay_record_t zc_begin_record;
 	zinject_record_t zc_inject_record;
-	boolean_t	zc_defer_destroy;
-	boolean_t	zc_temphold;
+	uint32_t	zc_defer_destroy;
+	uint32_t	zc_flags;
 	uint64_t	zc_action_handle;
 	int		zc_cleanup_fd;
 	uint8_t		zc_simple;
-	uint8_t		zc_pad[3];		/* alignment */
+	uint8_t		zc_pad3[3];
+	boolean_t	zc_resumable;
+	uint32_t	zc_pad4;
 	uint64_t	zc_sendobj;
 	uint64_t	zc_fromobj;
 	uint64_t	zc_createtxg;
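
The expanded feature-flag word above is still validated with
DMU_STREAM_SUPPORTED(). A hedged sketch of the receiving-side check,
assuming the existing DMU_GET_FEATUREFLAGS() accessor and a local
struct drr_begin *drrb (the error value is illustrative):

	uint64_t featureflags =
	    DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);

	/* Refuse streams carrying feature bits we do not implement. */
	if (!DMU_STREAM_SUPPORTED(featureflags))
		return (SET_ERROR(ENOTSUP));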

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -64,11 +65,11 @@
 	int		z_norm;		/* normalization flags */
 	boolean_t	z_atime;	/* enable atimes mount option */
 	boolean_t	z_unmounted;	/* unmounted */
-	rrwlock_t	z_teardown_lock;
+	rrmlock_t	z_teardown_lock;
 	krwlock_t	z_teardown_inactive_lock;
 	list_t		z_all_znodes;	/* all vnodes in the fs */
 	kmutex_t	z_znodes_lock;	/* lock for z_all_znodes */
-	vnode_t		*z_ctldir;	/* .zfs directory pointer */
+	struct zfsctl_root	*z_ctldir;	/* .zfs directory pointer */
 	boolean_t	z_show_ctldir;	/* expose .zfs in the root dir */
 	boolean_t	z_issnap;	/* true if this is a snapshot */
 	boolean_t	z_vscan;	/* virus scan on/off */
@@ -75,6 +76,7 @@
 	boolean_t	z_use_fuids;	/* version allows fuids */
 	boolean_t	z_replay;	/* set during ZIL replay */
 	boolean_t	z_use_sa;	/* version allows system attributes */
+	boolean_t	z_use_namecache;/* make use of FreeBSD name cache */
 	uint64_t	z_version;	/* ZPL version */
 	uint64_t	z_shares_dir;	/* hidden shares dir */
 	kmutex_t	z_lock;
@@ -110,7 +112,7 @@
 } zfid_short_t;
 
 /*
- * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes
+ * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes
  * (including the length field).  This makes files under .zfs/snapshot
  * accessible by NFSv3 and NFSv4, but not NFSv2.
  *
@@ -120,10 +122,13 @@
  *	6 bytes		object number (48 bits)
  *	4 bytes		generation number (32 bits)
  *	6 bytes		objset id (48 bits)
- *	4 bytes		currently just zero (32 bits)
+ *	4 bytes[**]	currently just zero (32 bits)
  *
  * We reserve only 48 bits for the object number and objset id, as these are
  * the limits currently defined and imposed by the DMU.
+ *
+ * [*] 20 bytes on FreeBSD to fit into the size of struct fid.
+ * [**] 2 bytes on FreeBSD for the above reason.
  */
 typedef struct zfid_long {
 	zfid_short_t	z_fid;
@@ -138,7 +143,7 @@
 extern int zfs_super_owner;
 
 extern int zfs_suspend_fs(zfsvfs_t *zfsvfs);
-extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname);
+extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds);
 extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     const char *domain, uint64_t rid, uint64_t *valuep);
 extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
@@ -153,7 +158,6 @@
 extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp);
 extern void zfsvfs_free(zfsvfs_t *zfsvfs);
 extern int zfs_check_global_label(const char *dsname, const char *hexsl);
-extern int zfs_vnode_lock(vnode_t *vp, int flags);
 
 #ifdef _KERNEL
 extern void zfsvfs_update_fromname(const char *oldname, const char *newname);

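The fid layout documented above is packed byte-at-a-time, low byte first.
A sketch with the FreeBSD field widths from the comment (2-byte length,
6-byte object, 4-byte generation, 6-byte objset, 2-byte trailing zero);
the struct and function names here are hypothetical:

#include <stdint.h>
#include <string.h>

struct zfid_sketch {			/* 2+6+4+6+2 = 20 bytes */
	uint16_t	zf_len;
	uint8_t		zf_object[6];	/* object number (48 bits) */
	uint8_t		zf_gen[4];	/* generation (32 bits) */
	uint8_t		zf_setid[6];	/* objset id (48 bits) */
	uint8_t		zf_setgen[2];	/* currently just zero */
};

static void
zfid_pack(struct zfid_sketch *zf, uint64_t object, uint32_t gen,
    uint64_t objset)
{
	memset(zf, 0, sizeof (*zf));
	zf->zf_len = sizeof (*zf);
	for (int i = 0; i < 6; i++)	/* low 48 bits only, per the DMU */
		zf->zf_object[i] = (uint8_t)(object >> (8 * i));
	for (int i = 0; i < 4; i++)
		zf->zf_gen[i] = (uint8_t)(gen >> (8 * i));
	for (int i = 0; i < 6; i++)
		zf->zf_setid[i] = (uint8_t)(objset >> (8 * i));
}
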
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef	_SYS_FS_ZFS_ZNODE_H
@@ -133,20 +135,7 @@
 #define	ZFS_SHARES_DIR		"SHARES"
 #define	ZFS_SA_ATTRS		"SA_ATTRS"
 
-#define	ZFS_MAX_BLOCKSIZE	(SPA_MAXBLOCKSIZE)
-
 /*
- * Path component length
- *
- * The generic fs code uses MAXNAMELEN to represent
- * what the largest component length is.  Unfortunately,
- * this length includes the terminating NULL.  ZFS needs
- * to tell the users via pathconf() and statvfs() what the
- * true maximum length of a component is, excluding the NULL.
- */
-#define	ZFS_MAXNAMELEN	(MAXNAMELEN - 1)
-
-/*
  * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in
  * the directory entries.
  */
@@ -182,10 +171,12 @@
 	struct zfsvfs	*z_zfsvfs;
 	vnode_t		*z_vnode;
 	uint64_t	z_id;		/* object ID for this znode */
+#ifdef illumos
 	kmutex_t	z_lock;		/* znode modification lock */
 	krwlock_t	z_parent_lock;	/* parent lock for directories */
 	krwlock_t	z_name_lock;	/* "master" lock for dirent locks */
 	zfs_dirlock_t	*z_dirlocks;	/* directory entry lock list */
+#endif
 	kmutex_t	z_range_lock;	/* protects changes to z_range_avl */
 	avl_tree_t	z_range_avl;	/* avl tree of file range locks */
 	uint8_t		z_unlinked;	/* file has been unlinked */
@@ -237,7 +228,7 @@
 {
 	vnode_t *vp = zp->z_vnode;
 
-	ASSERT(vp == NULL || vp->v_data == NULL || vp->v_data == zp);
+	ASSERT(vp != NULL && vp->v_data == zp);
 	return (vp);
 }
 static __inline znode_t *
@@ -245,7 +236,7 @@
 {
 	znode_t *zp = (znode_t *)vp->v_data;
 
-	ASSERT(zp == NULL || zp->z_vnode == NULL || zp->z_vnode == vp);
+	ASSERT(zp != NULL && zp->z_vnode == vp);
 	return (zp);
 }
 #else
@@ -256,7 +247,7 @@
 /* Called on entry to each ZFS vnode and vfs operation  */
 #define	ZFS_ENTER(zfsvfs) \
 	{ \
-		rrw_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \
+		rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \
 		if ((zfsvfs)->z_unmounted) { \
 			ZFS_EXIT(zfsvfs); \
 			return (EIO); \
@@ -263,12 +254,8 @@
 		} \
 	}
 
-/* Called on entry to each ZFS vnode and vfs operation that can not return EIO */
-#define	ZFS_ENTER_NOERROR(zfsvfs) \
-	rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG)
-
 /* Must be called before exiting the vop */
-#define	ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)
+#define	ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG)
 
 /* Verifies the znode is valid */
 #define	ZFS_VERIFY_ZP(zp) \
@@ -369,6 +356,8 @@
 extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
 extern int zfsfstype;
 
+extern int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf);
+
 #endif /* _KERNEL */
 
 extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len);

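The switch from rrw to rrm above does not change how the teardown lock is
used; every vnode/vfs operation still brackets its work with
ZFS_ENTER/ZFS_EXIT.  A sketch of the canonical pattern (hypothetical
operation name):

static int
zfs_example_vop(vnode_t *vp)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);	/* rrm_enter_read(); EIO if unmounted */
	ZFS_VERIFY_ZP(zp);	/* EIO if the znode lost its SA handle */

	/* ... operate on zp under the teardown read lock ... */

	ZFS_EXIT(zfsvfs);	/* rrm_exit() */
	return (0);
}
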
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -21,6 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -37,6 +39,9 @@
 extern "C" {
 #endif
 
+struct dsl_pool;
+struct dsl_dataset;
+
 /*
  * Intent log format:
  *
@@ -90,7 +95,6 @@
 } zil_chain_t;
 
 #define	ZIL_MIN_BLKSZ	4096ULL
-#define	ZIL_MAX_BLKSZ	SPA_MAXBLOCKSIZE
 
 /*
  * The words of a log block checksum.
@@ -366,7 +370,6 @@
 	void		*itx_private;	/* type-specific opaque data */
 	itx_wr_state_t	itx_wr_state;	/* write state */
 	uint8_t		itx_sync;	/* synchronous transaction */
-	uint64_t	itx_sod;	/* record size on disk */
 	uint64_t	itx_oid;	/* object id */
 	lr_t		itx_lr;		/* common part of log record */
 	/* followed by type-specific part of lr_xx_t and its immediate data */
@@ -402,11 +405,14 @@
 extern void	zil_itx_destroy(itx_t *itx);
 extern void	zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
 
+extern void	zil_async_to_sync(zilog_t *zilog, uint64_t oid);
 extern void	zil_commit(zilog_t *zilog, uint64_t oid);
 
 extern int	zil_vdev_offline(const char *osname, void *txarg);
-extern int	zil_claim(const char *osname, void *txarg);
-extern int	zil_check_log_chain(const char *osname, void *txarg);
+extern int	zil_claim(struct dsl_pool *dp,
+    struct dsl_dataset *ds, void *txarg);
+extern int	zil_check_log_chain(struct dsl_pool *dp,
+    struct dsl_dataset *ds, void *tx);
 extern void	zil_sync(zilog_t *zilog, dmu_tx_t *tx);
 extern void	zil_clean(zilog_t *zilog, uint64_t synced_txg);
 

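zil_claim() and zil_check_log_chain() now take the pool and dataset
directly, matching the callback shape of dmu_objset_find_dp() so the claim
pass can walk datasets without repeated name lookups.  A sketch of the
assumed call site (spa_load()-style; arguments illustrative):

/* dp is the dsl_pool_t, tx the claim transaction. */
(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_claim, tx,
    DS_FIND_CHILDREN);
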
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -21,6 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -41,6 +43,7 @@
 typedef struct lwb {
 	zilog_t		*lwb_zilog;	/* back pointer to log struct */
 	blkptr_t	lwb_blk;	/* on disk address of this log blk */
+	boolean_t	lwb_slog;	/* lwb_blk is on SLOG device */
 	int		lwb_nused;	/* # used bytes in buffer */
 	int		lwb_sz;		/* size of block and buffer */
 	char		*lwb_buf;	/* log write buffer */
@@ -61,7 +64,6 @@
 typedef struct itxg {
 	kmutex_t	itxg_lock;	/* lock for this structure */
 	uint64_t	itxg_txg;	/* txg for this chain */
-	uint64_t	itxg_sod;	/* total size on disk for this txg */
 	itxs_t		*itxg_itxs;	/* sync and async itxs */
 } itxg_t;
 
@@ -119,7 +121,6 @@
 	kcondvar_t	zl_cv_batch[2];	/* batch condition variables */
 	itxg_t		zl_itxg[TXG_SIZE]; /* intent log txg chains */
 	list_t		zl_itx_commit_list; /* itx list to be committed */
-	uint64_t	zl_itx_list_sz;	/* total size of records on list */
 	uint64_t	zl_cur_used;	/* current commit log size used */
 	list_t		zl_lwb_list;	/* in-flight log write list */
 	kmutex_t	zl_vdev_lock;	/* protects zl_vdev_tree */
@@ -139,8 +140,10 @@
 	avl_node_t	zn_node;
 } zil_bp_node_t;
 
-#define	ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
+#define	ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
     sizeof (lr_write_t))
+#define	ZIL_MAX_COPIED_DATA \
+    ((SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t))
 
 #ifdef	__cplusplus
 }

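Plugging numbers into the two new limits above, assuming
SPA_OLD_MAXBLOCKSIZE is 128 KiB and, on LP64, sizeof (zil_chain_t) == 184
and sizeof (lr_write_t) == 192:

/*
 * ZIL_MAX_LOG_DATA    = 131072 - 184 - 192       = 130696 bytes
 * ZIL_MAX_COPIED_DATA = (131072 - 184) / 2 - 192 =  65252 bytes
 */

so a single lr_write_t keeps at most ~64 KiB of copied data in a log block,
even though the block itself may hold up to ~128 KiB of record data.
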
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -21,7 +22,8 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
@@ -28,6 +30,7 @@
 #ifndef _ZIO_H
 #define	_ZIO_H
 
+#include <sys/zio_priority.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
@@ -79,9 +82,21 @@
 	ZIO_CHECKSUM_FLETCHER_4,
 	ZIO_CHECKSUM_SHA256,
 	ZIO_CHECKSUM_ZILOG2,
+	ZIO_CHECKSUM_NOPARITY,
+#ifdef illumos
+	ZIO_CHECKSUM_SHA512,
+	ZIO_CHECKSUM_SKEIN,
+	ZIO_CHECKSUM_EDONR,
+#endif
 	ZIO_CHECKSUM_FUNCTIONS
 };
 
+/*
+ * The number of "legacy" checksum functions which can be set on individual
+ * objects.
+ */
+#define	ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
+
 #define	ZIO_CHECKSUM_ON_VALUE	ZIO_CHECKSUM_FLETCHER_4
 #define	ZIO_CHECKSUM_DEFAULT	ZIO_CHECKSUM_ON
 
@@ -111,15 +126,25 @@
 	ZIO_COMPRESS_FUNCTIONS
 };
 
-/* N.B. when altering this value, also change BOOTFS_COMPRESS_VALID below */
-#define	ZIO_COMPRESS_ON_VALUE	ZIO_COMPRESS_LZJB
-#define	ZIO_COMPRESS_DEFAULT	ZIO_COMPRESS_OFF
+/*
+ * The number of "legacy" compression functions which can be set on individual
+ * objects.
+ */
+#define	ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
 
+/*
+ * The meaning of "compress = on" is selected by the compression features
+ * enabled on a given pool.
+ */
+#define	ZIO_COMPRESS_LEGACY_ON_VALUE	ZIO_COMPRESS_LZJB
+#define	ZIO_COMPRESS_LZ4_ON_VALUE	ZIO_COMPRESS_LZ4
+
+#define	ZIO_COMPRESS_DEFAULT		ZIO_COMPRESS_OFF
+
 #define	BOOTFS_COMPRESS_VALID(compress)			\
 	((compress) == ZIO_COMPRESS_LZJB ||		\
 	(compress) == ZIO_COMPRESS_LZ4 ||		\
-	((compress) == ZIO_COMPRESS_ON &&		\
-	ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) ||	\
+	(compress) == ZIO_COMPRESS_ON ||		\
 	(compress) == ZIO_COMPRESS_OFF)
 
 #define	ZIO_FAILURE_MODE_WAIT		0
@@ -126,24 +151,6 @@
 #define	ZIO_FAILURE_MODE_CONTINUE	1
 #define	ZIO_FAILURE_MODE_PANIC		2
 
-#define	ZIO_PRIORITY_NOW		(zio_priority_table[0])
-#define	ZIO_PRIORITY_SYNC_READ		(zio_priority_table[1])
-#define	ZIO_PRIORITY_SYNC_WRITE		(zio_priority_table[2])
-#define	ZIO_PRIORITY_LOG_WRITE		(zio_priority_table[3])
-#define	ZIO_PRIORITY_CACHE_FILL		(zio_priority_table[4])
-#define	ZIO_PRIORITY_AGG		(zio_priority_table[5])
-#define	ZIO_PRIORITY_FREE		(zio_priority_table[6])
-#define	ZIO_PRIORITY_ASYNC_WRITE	(zio_priority_table[7])
-#define	ZIO_PRIORITY_ASYNC_READ		(zio_priority_table[8])
-#define	ZIO_PRIORITY_RESILVER		(zio_priority_table[9])
-#define	ZIO_PRIORITY_SCRUB		(zio_priority_table[10])
-#define	ZIO_PRIORITY_DDT_PREFETCH	(zio_priority_table[11])
-#define	ZIO_PRIORITY_TRIM		(zio_priority_table[12])
-#define	ZIO_PRIORITY_TABLE_SIZE		13
-
-#define	ZIO_PIPELINE_CONTINUE		0x100
-#define	ZIO_PIPELINE_STOP		0x101
-
 enum zio_flag {
 	/*
 	 * Flags inherited by gang, ddt, and vdev children,
@@ -155,6 +162,7 @@
 	ZIO_FLAG_RESILVER	= 1 << 3,
 	ZIO_FLAG_SCRUB		= 1 << 4,
 	ZIO_FLAG_SCAN_THREAD	= 1 << 5,
+	ZIO_FLAG_PHYSICAL	= 1 << 6,
 
 #define	ZIO_FLAG_AGG_INHERIT	(ZIO_FLAG_CANFAIL - 1)
 
@@ -161,13 +169,14 @@
 	/*
 	 * Flags inherited by ddt, gang, and vdev children.
 	 */
-	ZIO_FLAG_CANFAIL	= 1 << 6,	/* must be first for INHERIT */
-	ZIO_FLAG_SPECULATIVE	= 1 << 7,
-	ZIO_FLAG_CONFIG_WRITER	= 1 << 8,
-	ZIO_FLAG_DONT_RETRY	= 1 << 9,
-	ZIO_FLAG_DONT_CACHE	= 1 << 10,
-	ZIO_FLAG_NODATA		= 1 << 11,
-	ZIO_FLAG_INDUCE_DAMAGE	= 1 << 12,
+	ZIO_FLAG_CANFAIL	= 1 << 7,	/* must be first for INHERIT */
+	ZIO_FLAG_SPECULATIVE	= 1 << 8,
+	ZIO_FLAG_CONFIG_WRITER	= 1 << 9,
+	ZIO_FLAG_DONT_RETRY	= 1 << 10,
+	ZIO_FLAG_DONT_CACHE	= 1 << 11,
+	ZIO_FLAG_NODATA		= 1 << 12,
+	ZIO_FLAG_INDUCE_DAMAGE	= 1 << 13,
+	ZIO_FLAG_IO_ALLOCATING	= 1 << 14,
 
 #define	ZIO_FLAG_DDT_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
 #define	ZIO_FLAG_GANG_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
@@ -175,10 +184,10 @@
 	/*
 	 * Flags inherited by vdev children.
 	 */
-	ZIO_FLAG_IO_RETRY	= 1 << 13,	/* must be first for INHERIT */
-	ZIO_FLAG_PROBE		= 1 << 14,
-	ZIO_FLAG_TRYHARD	= 1 << 15,
-	ZIO_FLAG_OPTIONAL	= 1 << 16,
+	ZIO_FLAG_IO_RETRY	= 1 << 15,	/* must be first for INHERIT */
+	ZIO_FLAG_PROBE		= 1 << 16,
+	ZIO_FLAG_TRYHARD	= 1 << 17,
+	ZIO_FLAG_OPTIONAL	= 1 << 18,
 
 #define	ZIO_FLAG_VDEV_INHERIT	(ZIO_FLAG_DONT_QUEUE - 1)
 
@@ -185,16 +194,17 @@
 	/*
 	 * Flags not inherited by any children.
 	 */
-	ZIO_FLAG_DONT_QUEUE	= 1 << 17,	/* must be first for INHERIT */
-	ZIO_FLAG_DONT_PROPAGATE	= 1 << 18,
-	ZIO_FLAG_IO_BYPASS	= 1 << 19,
-	ZIO_FLAG_IO_REWRITE	= 1 << 20,
-	ZIO_FLAG_RAW		= 1 << 21,
-	ZIO_FLAG_GANG_CHILD	= 1 << 22,
-	ZIO_FLAG_DDT_CHILD	= 1 << 23,
-	ZIO_FLAG_GODFATHER	= 1 << 24,
-	ZIO_FLAG_NOPWRITE	= 1 << 25,
-	ZIO_FLAG_REEXECUTED	= 1 << 26,
+	ZIO_FLAG_DONT_QUEUE	= 1 << 19,	/* must be first for INHERIT */
+	ZIO_FLAG_DONT_PROPAGATE	= 1 << 20,
+	ZIO_FLAG_IO_BYPASS	= 1 << 21,
+	ZIO_FLAG_IO_REWRITE	= 1 << 22,
+	ZIO_FLAG_RAW		= 1 << 23,
+	ZIO_FLAG_GANG_CHILD	= 1 << 24,
+	ZIO_FLAG_DDT_CHILD	= 1 << 25,
+	ZIO_FLAG_GODFATHER	= 1 << 26,
+	ZIO_FLAG_NOPWRITE	= 1 << 27,
+	ZIO_FLAG_REEXECUTED	= 1 << 28,
+	ZIO_FLAG_DELEGATED	= 1 << 29,
 };
 
 #define	ZIO_FLAG_MUSTSUCCEED		0
@@ -211,6 +221,9 @@
 	(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) |		\
 	ZIO_FLAG_CANFAIL)
 
+#define	ZIO_CHILD_BIT(x)		(1 << (x))
+#define	ZIO_CHILD_BIT_IS_SET(val, x)	((val) & (1 << (x)))
+
 enum zio_child {
 	ZIO_CHILD_VDEV = 0,
 	ZIO_CHILD_GANG,
@@ -219,6 +232,14 @@
 	ZIO_CHILD_TYPES
 };
 
+#define	ZIO_CHILD_VDEV_BIT		ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
+#define	ZIO_CHILD_GANG_BIT		ZIO_CHILD_BIT(ZIO_CHILD_GANG)
+#define	ZIO_CHILD_DDT_BIT		ZIO_CHILD_BIT(ZIO_CHILD_DDT)
+#define	ZIO_CHILD_LOGICAL_BIT		ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
+#define	ZIO_CHILD_ALL_BITS					\
+	(ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | 		\
+	ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
+
 enum zio_wait_type {
 	ZIO_WAIT_READY = 0,
 	ZIO_WAIT_DONE,
@@ -235,8 +256,8 @@
 
 typedef void zio_done_func_t(zio_t *zio);
 
-extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
-extern char *zio_type_name[ZIO_TYPES];
+extern boolean_t zio_dva_throttle_enabled;
+extern const char *zio_type_name[ZIO_TYPES];
 
 /*
  * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
@@ -247,20 +268,21 @@
  * Root blocks (objset_phys_t) are object 0, level -1:  <objset, 0, -1, 0>.
  * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
  * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
+ * dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
  *
  * Note: this structure is called a bookmark because its original purpose
  * was to remember where to resume a pool-wide traverse.
  *
- * Note: this structure is passed between userland and the kernel.
- * Therefore it must not change size or alignment between 32/64 bit
- * compilation options.
+ * Note: this structure is passed between userland and the kernel, and is
+ * stored on disk (by virtue of being incorporated into other on-disk
+ * structures, e.g. dsl_scan_phys_t).
  */
-typedef struct zbookmark {
+typedef struct zbookmark_phys {
 	uint64_t	zb_objset;
 	uint64_t	zb_object;
 	int64_t		zb_level;
 	uint64_t	zb_blkid;
-} zbookmark_t;
+} zbookmark_phys_t;
 
 #define	SET_BOOKMARK(zb, objset, object, level, blkid)  \
 {                                                       \
@@ -279,6 +301,9 @@
 #define	ZB_ZIL_OBJECT		(0ULL)
 #define	ZB_ZIL_LEVEL		(-2LL)
 
+#define	ZB_DNODE_LEVEL		(-3LL)
+#define	ZB_DNODE_BLKID		(0ULL)
+
 #define	ZB_IS_ZERO(zb)						\
 	((zb)->zb_objset == 0 && (zb)->zb_object == 0 &&	\
 	(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
@@ -404,12 +429,12 @@
 
 struct zio {
 	/* Core information about this I/O */
-	zbookmark_t	io_bookmark;
+	zbookmark_phys_t	io_bookmark;
 	zio_prop_t	io_prop;
 	zio_type_t	io_type;
 	enum zio_child	io_child_type;
 	int		io_cmd;
-	uint8_t		io_priority;
+	zio_priority_t	io_priority;
 	uint8_t		io_reexecute;
 	uint8_t		io_state[ZIO_WAIT_TYPES];
 	uint64_t	io_txg;
@@ -419,12 +444,13 @@
 	blkptr_t	io_bp_copy;
 	list_t		io_parent_list;
 	list_t		io_child_list;
-	zio_link_t	*io_walk_link;
 	zio_t		*io_logical;
 	zio_transform_t *io_transform_stack;
 
 	/* Callback info */
 	zio_done_func_t	*io_ready;
+	zio_done_func_t	*io_children_ready;
+	zio_done_func_t	*io_physdone;
 	zio_done_func_t	*io_done;
 	void		*io_private;
 	int64_t		io_prev_space_delta;	/* DMU private */
@@ -442,11 +468,12 @@
 	const zio_vsd_ops_t *io_vsd_ops;
 
 	uint64_t	io_offset;
-	uint64_t	io_deadline;
 	hrtime_t	io_timestamp;
+	hrtime_t	io_queued_timestamp;
+	hrtime_t	io_target_timestamp;
+	avl_node_t	io_queue_node;
 	avl_node_t	io_offset_node;
-	avl_node_t	io_deadline_node;
-	avl_tree_t	*io_vdev_tree;
+	avl_node_t	io_alloc_node;
 
 	/* Internal pipeline state */
 	enum zio_flag	io_flags;
@@ -455,10 +482,12 @@
 	enum zio_flag	io_orig_flags;
 	enum zio_stage	io_orig_stage;
 	enum zio_stage	io_orig_pipeline;
+	enum zio_stage	io_pipeline_trace;
 	int		io_error;
 	int		io_child_error[ZIO_CHILD_TYPES];
 	uint64_t	io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
 	uint64_t	io_child_count;
+	uint64_t	io_phys_children;
 	uint64_t	io_parent_count;
 	uint64_t	*io_stall;
 	zio_t		*io_gang_leader;
@@ -472,14 +501,15 @@
 	zio_cksum_report_t *io_cksum_report;
 	uint64_t	io_ena;
 
-#ifdef _KERNEL
-	/* FreeBSD only. */
-	struct ostask	io_task;
-#endif
+	/* Taskq dispatching state */
+	taskq_ent_t	io_tqent;
+
 	avl_node_t	io_trim_node;
 	list_node_t	io_trim_link;
 };
 
+extern int zio_timestamp_compare(const void *, const void *);
+
 extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
     zio_done_func_t *done, void *priv, enum zio_flag flags);
 
@@ -488,16 +518,18 @@
 
 extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
     uint64_t size, zio_done_func_t *done, void *priv,
-    int priority, enum zio_flag flags, const zbookmark_t *zb);
+    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
 
 extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     void *data, uint64_t size, const zio_prop_t *zp,
-    zio_done_func_t *ready, zio_done_func_t *done, void *priv,
-    int priority, enum zio_flag flags, const zbookmark_t *zb);
+    zio_done_func_t *ready, zio_done_func_t *children_ready,
+    zio_done_func_t *physdone, zio_done_func_t *done,
+    void *priv, zio_priority_t priority, enum zio_flag flags,
+    const zbookmark_phys_t *zb);
 
 extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *priv,
-    int priority, enum zio_flag flags, zbookmark_t *zb);
+    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);
 
 extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
     boolean_t nopwrite);
@@ -510,23 +542,23 @@
 
 extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
     uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv,
-    int priority, enum zio_flag flags);
+    zio_priority_t priority, enum zio_flag flags);
 
 extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, void *data, int checksum,
-    zio_done_func_t *done, void *priv, int priority, enum zio_flag flags,
-    boolean_t labels);
+    zio_done_func_t *done, void *priv, zio_priority_t priority,
+    enum zio_flag flags, boolean_t labels);
 
 extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, void *data, int checksum,
-    zio_done_func_t *done, void *priv, int priority, enum zio_flag flags,
-    boolean_t labels);
+    zio_done_func_t *done, void *priv, zio_priority_t priority,
+    enum zio_flag flags, boolean_t labels);
 
 extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp, uint64_t size, enum zio_flag flags);
 
 extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
-    blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
+    blkptr_t *old_bp, uint64_t size, boolean_t *slog);
 extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
 extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset,
@@ -537,25 +569,33 @@
 extern void zio_nowait(zio_t *zio);
 extern void zio_execute(zio_t *zio);
 extern void zio_interrupt(zio_t *zio);
+extern void zio_delay_init(zio_t *zio);
+extern void zio_delay_interrupt(zio_t *zio);
 
-extern zio_t *zio_walk_parents(zio_t *cio);
-extern zio_t *zio_walk_children(zio_t *pio);
+extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
+extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
 extern zio_t *zio_unique_parent(zio_t *cio);
 extern void zio_add_child(zio_t *pio, zio_t *cio);
 
 extern void *zio_buf_alloc(size_t size);
+extern void *zio_buf_alloc_nowait(size_t size);
 extern void zio_buf_free(void *buf, size_t size);
 extern void *zio_data_buf_alloc(size_t size);
 extern void zio_data_buf_free(void *buf, size_t size);
 
+extern void zio_push_transform(zio_t *zio, void *data, uint64_t size,
+    uint64_t bufsize, zio_transform_func_t *transform);
+extern void zio_pop_transforms(zio_t *zio);
+
 extern void zio_resubmit_stage_async(void *);
 
 extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
-    uint64_t offset, void *data, uint64_t size, int type, int priority,
-    enum zio_flag flags, zio_done_func_t *done, void *priv);
+    uint64_t offset, void *data, uint64_t size, int type,
+    zio_priority_t priority, enum zio_flag flags,
+    zio_done_func_t *done, void *priv);
 
 extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
-    void *data, uint64_t size, int type, int priority,
+    void *data, uint64_t size, int type, zio_priority_t priority,
     enum zio_flag flags, zio_done_func_t *done, void *priv);
 
 extern void zio_vdev_io_bypass(zio_t *zio);
@@ -569,8 +609,8 @@
     enum zio_checksum parent);
 extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
     enum zio_checksum child, enum zio_checksum parent);
-extern enum zio_compress zio_compress_select(enum zio_compress child,
-    enum zio_compress parent);
+extern enum zio_compress zio_compress_select(spa_t *spa,
+    enum zio_compress child, enum zio_compress parent);
 
 extern void zio_suspend(spa_t *spa, zio_t *zio);
 extern int zio_resume(spa_t *spa);
@@ -597,7 +637,7 @@
 extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
 extern int zio_handle_label_injection(zio_t *zio, int error);
 extern void zio_handle_ignored_writes(zio_t *zio);
-extern uint64_t zio_handle_io_delay(zio_t *zio);
+extern hrtime_t zio_handle_io_delay(zio_t *zio);
 
 /*
  * Checksum ereport functions
@@ -618,9 +658,11 @@
 /* Called from spa_sync(), but primarily an injection handler */
 extern void spa_handle_ignored_writes(spa_t *spa);
 
-/* zbookmark functions */
-boolean_t zbookmark_is_before(const struct dnode_phys *dnp,
-    const zbookmark_t *zb1, const zbookmark_t *zb2);
+/* zbookmark_phys functions */
+boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
+    const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
+int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
+    uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
 
 #ifdef	__cplusplus
 }

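With ZIO_CHILD_VDEV = 0 through ZIO_CHILD_LOGICAL = 3 in the enum above,
the new bit macros expand to fixed values; worked out:

/*
 * ZIO_CHILD_VDEV_BIT    = 1 << 0 = 0x1
 * ZIO_CHILD_GANG_BIT    = 1 << 1 = 0x2
 * ZIO_CHILD_DDT_BIT     = 1 << 2 = 0x4
 * ZIO_CHILD_LOGICAL_BIT = 1 << 3 = 0x8
 * ZIO_CHILD_ALL_BITS    = 0xF
 *
 * ZIO_CHILD_BIT_IS_SET(0xF, ZIO_CHILD_DDT) != 0, for example.
 */
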
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,6 +21,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright Saso Kiselkov 2013, All rights reserved.
  */
 
 #ifndef _SYS_ZIO_CHECKSUM_H
@@ -26,6 +29,7 @@
 #define	_SYS_ZIO_CHECKSUM_H
 
 #include <sys/zio.h>
+#include <zfeature_common.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -34,17 +38,34 @@
 /*
  * Signature for checksum functions.
  */
-typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
+typedef void zio_checksum_t(const void *data, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp);
+typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
+typedef void zio_checksum_tmpl_free_t(void *ctx_template);
 
+typedef enum zio_checksum_flags {
+	/* Strong enough for metadata? */
+	ZCHECKSUM_FLAG_METADATA = (1 << 1),
+	/* ZIO embedded checksum */
+	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
+	/* Strong enough for dedup (without verification)? */
+	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
+	/* Uses salt value */
+	ZCHECKSUM_FLAG_SALTED = (1 << 4),
+	/* Strong enough for nopwrite? */
+	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
+} zio_checksum_flags_t;
+
 /*
  * Information about each checksum function.
  */
 typedef struct zio_checksum_info {
-	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
-	int		ci_correctable;	/* number of correctable bits	*/
-	int		ci_eck;		/* uses zio embedded checksum? */
-	int		ci_dedup;	/* strong enough for dedup? */
-	char		*ci_name;	/* descriptive name */
+	/* checksum function for each byteorder */
+	zio_checksum_t			*ci_func[2];
+	zio_checksum_tmpl_init_t	*ci_tmpl_init;
+	zio_checksum_tmpl_free_t	*ci_tmpl_free;
+	zio_checksum_flags_t		ci_flags;
+	char				*ci_name;	/* descriptive name */
 } zio_checksum_info_t;
 
 typedef struct zio_bad_cksum {
@@ -62,11 +83,33 @@
  * Checksum routines.
  */
 extern zio_checksum_t zio_checksum_SHA256;
+#ifdef illumos
+extern zio_checksum_t zio_checksum_SHA512_native;
+extern zio_checksum_t zio_checksum_SHA512_byteswap;
 
+/* Skein */
+extern zio_checksum_t zio_checksum_skein_native;
+extern zio_checksum_t zio_checksum_skein_byteswap;
+extern zio_checksum_tmpl_init_t zio_checksum_skein_tmpl_init;
+extern zio_checksum_tmpl_free_t zio_checksum_skein_tmpl_free;
+
+/* Edon-R */
+extern zio_checksum_t zio_checksum_edonr_native;
+extern zio_checksum_t zio_checksum_edonr_byteswap;
+extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init;
+extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free;
+#endif
+
+extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum,
+    void *, uint64_t, uint64_t, zio_bad_cksum_t *);
 extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
     void *data, uint64_t size);
+extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum,
+    void *, uint64_t, uint64_t, zio_bad_cksum_t *);
 extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
 extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
+extern void zio_checksum_templates_free(spa_t *spa);
+extern spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum);
 
 #ifdef	__cplusplus
 }

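Under the restructured zio_checksum_info_t, each table entry carries the
native/byteswap function pair, optional salt-template hooks, and capability
flags.  A hypothetical entry using only the declarations above (values
illustrative, not copied from zio_checksum.c):

static const zio_checksum_info_t sha256_entry_sketch = {
	{ zio_checksum_SHA256, zio_checksum_SHA256 },	/* ci_func[2] */
	NULL, NULL,			/* no salt template needed */
	ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE,	/* ci_flags */
	"sha256"			/* ci_name */
};
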
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -70,6 +71,8 @@
     int level);
 extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
     int level);
+extern void lz4_init(void);
+extern void lz4_fini(void);
 extern size_t lz4_compress(void *src, void *dst, size_t s_len, size_t d_len,
     int level);
 extern int lz4_decompress(void *src, void *dst, size_t s_len, size_t d_len,
@@ -83,6 +86,12 @@
 extern int zio_decompress_data(enum zio_compress c, void *src, void *dst,
     size_t s_len, size_t d_len);
 
+/*
+ * Module lifetime management.
+ */
+extern void zio_compress_init(void);
+extern void zio_compress_fini(void);
+
 #ifdef	__cplusplus
 }
 #endif

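lz4 keeps internal global state, so the new hooks give the compression code
an explicit lifetime.  A sketch of the assumed ordering (the actual call
sites live in the platform module code):

zio_compress_init();	/* once at load; sets up lz4 via lz4_init() */
/* ... zio_compress_data() / zio_decompress_data() calls ... */
zio_compress_fini();	/* once at unload; tears down via lz4_fini() */
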
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -24,7 +25,7 @@
  */
 
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef _ZIO_IMPL_H
@@ -38,7 +39,7 @@
 #endif
 
 /*
- * XXX -- Describe ZFS I/O pipleine here. Fill in as needed.
+ * XXX -- Describe ZFS I/O pipeline here. Fill in as needed.
  *
  * The ZFS I/O pipeline is comprised of various stages which are defined
  * in the zio_stage enum below. The individual stages are used to construct
@@ -108,35 +109,37 @@
 	ZIO_STAGE_OPEN			= 1 << 0,	/* RWFCI */
 
 	ZIO_STAGE_READ_BP_INIT		= 1 << 1,	/* R---- */
-	ZIO_STAGE_FREE_BP_INIT		= 1 << 2,	/* --F-- */
-	ZIO_STAGE_ISSUE_ASYNC		= 1 << 3,	/* RWF-- */
-	ZIO_STAGE_WRITE_BP_INIT		= 1 << 4,	/* -W--- */
+	ZIO_STAGE_WRITE_BP_INIT		= 1 << 2,	/* -W--- */
+	ZIO_STAGE_FREE_BP_INIT		= 1 << 3,	/* --F-- */
+	ZIO_STAGE_ISSUE_ASYNC		= 1 << 4,	/* RWF-- */
+	ZIO_STAGE_WRITE_COMPRESS	= 1 << 5,	/* -W--- */
 
-	ZIO_STAGE_CHECKSUM_GENERATE	= 1 << 5,	/* -W--- */
+	ZIO_STAGE_CHECKSUM_GENERATE	= 1 << 6,	/* -W--- */
 
-	ZIO_STAGE_NOP_WRITE		= 1 << 6,	/* -W--- */
+	ZIO_STAGE_NOP_WRITE		= 1 << 7,	/* -W--- */
 
-	ZIO_STAGE_DDT_READ_START	= 1 << 7,	/* R---- */
-	ZIO_STAGE_DDT_READ_DONE		= 1 << 8,	/* R---- */
-	ZIO_STAGE_DDT_WRITE		= 1 << 9,	/* -W--- */
-	ZIO_STAGE_DDT_FREE		= 1 << 10,	/* --F-- */
+	ZIO_STAGE_DDT_READ_START	= 1 << 8,	/* R---- */
+	ZIO_STAGE_DDT_READ_DONE		= 1 << 9,	/* R---- */
+	ZIO_STAGE_DDT_WRITE		= 1 << 10,	/* -W--- */
+	ZIO_STAGE_DDT_FREE		= 1 << 11,	/* --F-- */
 
-	ZIO_STAGE_GANG_ASSEMBLE		= 1 << 11,	/* RWFC- */
-	ZIO_STAGE_GANG_ISSUE		= 1 << 12,	/* RWFC- */
+	ZIO_STAGE_GANG_ASSEMBLE		= 1 << 12,	/* RWFC- */
+	ZIO_STAGE_GANG_ISSUE		= 1 << 13,	/* RWFC- */
 
-	ZIO_STAGE_DVA_ALLOCATE		= 1 << 13,	/* -W--- */
-	ZIO_STAGE_DVA_FREE		= 1 << 14,	/* --F-- */
-	ZIO_STAGE_DVA_CLAIM		= 1 << 15,	/* ---C- */
+	ZIO_STAGE_DVA_THROTTLE		= 1 << 14,	/* -W--- */
+	ZIO_STAGE_DVA_ALLOCATE		= 1 << 15,	/* -W--- */
+	ZIO_STAGE_DVA_FREE		= 1 << 16,	/* --F-- */
+	ZIO_STAGE_DVA_CLAIM		= 1 << 17,	/* ---C- */
 
-	ZIO_STAGE_READY			= 1 << 16,	/* RWFCI */
+	ZIO_STAGE_READY			= 1 << 18,	/* RWFCI */
 
-	ZIO_STAGE_VDEV_IO_START		= 1 << 17,	/* RWF-I */
-	ZIO_STAGE_VDEV_IO_DONE		= 1 << 18,	/* RWF-- */
-	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 19,	/* RWF-I */
+	ZIO_STAGE_VDEV_IO_START		= 1 << 19,	/* RWF-I */
+	ZIO_STAGE_VDEV_IO_DONE		= 1 << 20,	/* RWF-I */
+	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 21,	/* RWF-I */
 
-	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 20,	/* R---- */
+	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 22,	/* R---- */
 
-	ZIO_STAGE_DONE			= 1 << 21	/* RWFCI */
+	ZIO_STAGE_DONE			= 1 << 23	/* RWFCI */
 };
 
 #define	ZIO_INTERLOCK_STAGES			\
@@ -187,22 +190,27 @@
 
 #define	ZIO_REWRITE_PIPELINE			\
 	(ZIO_WRITE_COMMON_STAGES |		\
+	ZIO_STAGE_WRITE_COMPRESS |		\
 	ZIO_STAGE_WRITE_BP_INIT)
 
 #define	ZIO_WRITE_PIPELINE			\
 	(ZIO_WRITE_COMMON_STAGES |		\
 	ZIO_STAGE_WRITE_BP_INIT |		\
+	ZIO_STAGE_WRITE_COMPRESS |		\
+	ZIO_STAGE_DVA_THROTTLE |		\
 	ZIO_STAGE_DVA_ALLOCATE)
 
 #define	ZIO_DDT_CHILD_WRITE_PIPELINE		\
 	(ZIO_INTERLOCK_STAGES |			\
 	ZIO_VDEV_IO_STAGES |			\
+	ZIO_STAGE_DVA_THROTTLE |		\
 	ZIO_STAGE_DVA_ALLOCATE)
 
 #define	ZIO_DDT_WRITE_PIPELINE			\
 	(ZIO_INTERLOCK_STAGES |			\
+	ZIO_STAGE_WRITE_BP_INIT |		\
 	ZIO_STAGE_ISSUE_ASYNC |			\
-	ZIO_STAGE_WRITE_BP_INIT |		\
+	ZIO_STAGE_WRITE_COMPRESS |		\
 	ZIO_STAGE_CHECKSUM_GENERATE |		\
 	ZIO_STAGE_DDT_WRITE)
 
@@ -213,11 +221,12 @@
 #define	ZIO_FREE_PIPELINE			\
 	(ZIO_INTERLOCK_STAGES |			\
 	ZIO_STAGE_FREE_BP_INIT |		\
-	ZIO_STAGE_ISSUE_ASYNC |			\
-	ZIO_STAGE_DVA_FREE |			\
-	ZIO_STAGE_VDEV_IO_START |		\
-	ZIO_STAGE_VDEV_IO_ASSESS)
+	ZIO_STAGE_DVA_FREE)
 
+#define	ZIO_FREE_PHYS_PIPELINE			\
+	(ZIO_INTERLOCK_STAGES |			\
+	ZIO_VDEV_IO_STAGES)
+
 #define	ZIO_DDT_FREE_PIPELINE			\
 	(ZIO_INTERLOCK_STAGES |			\
 	ZIO_STAGE_FREE_BP_INIT |		\

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,6 +21,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_ZRLOCK_H
@@ -44,12 +46,8 @@
 
 extern void zrl_init(zrlock_t *);
 extern void zrl_destroy(zrlock_t *);
-#ifdef	ZFS_DEBUG
-#define	zrl_add(_z)	zrl_add_debug((_z), __func__)
-extern void zrl_add_debug(zrlock_t *, const char *);
-#else
-extern void zrl_add(zrlock_t *);
-#endif
+#define	zrl_add(_z)	zrl_add_impl((_z), __func__)
+extern void zrl_add_impl(zrlock_t *, const char *);
 extern void zrl_remove(zrlock_t *);
 extern int zrl_tryenter(zrlock_t *);
 extern void zrl_exit(zrlock_t *);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -43,9 +44,9 @@
 extern int zvol_create_minor(const char *);
 extern int zvol_remove_minor(const char *);
 extern void zvol_remove_minors(const char *);
-extern int zvol_set_volsize(const char *, major_t, uint64_t);
+extern int zvol_set_volsize(const char *, uint64_t);
 
-#ifdef sun
+#ifdef illumos
 extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
 extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks);
 extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
@@ -54,7 +55,7 @@
 extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
 extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
 extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
-#endif	/* sun */
+#endif	/* illumos */
 extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
     int *rvalp);
 extern int zvol_busy(void);
@@ -61,7 +62,7 @@
 extern void zvol_init(void);
 extern void zvol_fini(void);
 
-#ifdef sun
+#ifdef illumos
 extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize,
     uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
     void **rl_hdl, void **bonus_hdl);
@@ -69,9 +70,9 @@
 extern int zvol_get_volume_wce(void *minor_hdl);
 extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off,
     ssize_t resid, boolean_t sync);
-#endif	/* sun */
+#endif	/* illumos */
 
-#ifdef __FreeBSD__
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 extern int zvol_create_minors(const char *name);
 extern void zvol_rename_minors(const char *oldname, const char *newname);
 #endif

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -41,17 +41,20 @@
 #define	TRIM_ZIO_END(vd, offset, size)	(offset +		\
  	P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift))
 
-#define TRIM_MAP_SINC(tm, size)					\
-	atomic_add_64(&(tm)->tm_bytes, (size))
+/* Maximal segment size for ATA TRIM. */
+#define TRIM_MAP_SIZE_FACTOR	(512 << 16)
 
-#define TRIM_MAP_SDEC(tm, size)					\
-	atomic_add_64(&(tm)->tm_bytes, -(size))
+#define TRIM_MAP_SEGS(size)	(1 + (size) / TRIM_MAP_SIZE_FACTOR)
 
-#define TRIM_MAP_QINC(tm)					\
-	atomic_inc_64(&(tm)->tm_pending);			\
+#define TRIM_MAP_ADD(tm, ts)	do {				\
+	list_insert_tail(&(tm)->tm_head, (ts));			\
+	(tm)->tm_pending += TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
+} while (0)
 
-#define TRIM_MAP_QDEC(tm)					\
-	atomic_dec_64(&(tm)->tm_pending);
+#define TRIM_MAP_REM(tm, ts)	do {				\
+	list_remove(&(tm)->tm_head, (ts));			\
+	(tm)->tm_pending -= TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
+} while (0)
 
 typedef struct trim_map {
 	list_t		tm_head;		/* List of segments sorted by txg. */
@@ -61,7 +64,6 @@
 	list_t		tm_pending_writes;	/* Writes blocked on in-flight frees. */
 	kmutex_t	tm_lock;
 	uint64_t	tm_pending;		/* Count of pending TRIMs. */
-	uint64_t	tm_bytes;		/* Total size in bytes of queued TRIMs. */
 } trim_map_t;
 
 typedef struct trim_seg {
@@ -75,13 +77,10 @@
 
 extern boolean_t zfs_trim_enabled;
 
-static u_int trim_txg_delay = 32;
-static u_int trim_timeout = 30;
-static u_int trim_max_interval = 1;
-/* Limit outstanding TRIMs to 2G (max size for a single TRIM request) */
-static uint64_t trim_vdev_max_bytes = 2147483648;
-/* Limit outstanding TRIMs to 64 (max ranges for a single TRIM request) */	
-static u_int trim_vdev_max_pending = 64;
+static u_int trim_txg_delay = 32;	/* Keep deleted data up to 32 TXG */
+static u_int trim_timeout = 30;		/* Keep deleted data up to 30s */
+static u_int trim_max_interval = 1;	/* 1s delays between TRIMs */
+static u_int trim_vdev_max_pending = 10000; /* Keep up to 10K segments */
 
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM");
@@ -100,11 +99,6 @@
     "Maximum interval between TRIM queue processing (seconds)");
 
 SYSCTL_DECL(_vfs_zfs_vdev);
-TUNABLE_QUAD("vfs.zfs.vdev.trim_max_bytes", &trim_vdev_max_bytes);
-SYSCTL_QUAD(_vfs_zfs_vdev, OID_AUTO, trim_max_bytes, CTLFLAG_RWTUN,
-    &trim_vdev_max_bytes, 0,
-    "Maximum pending TRIM bytes for a vdev");
-
 TUNABLE_INT("vfs.zfs.vdev.trim_max_pending", &trim_vdev_max_pending);
 SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN,
     &trim_vdev_max_pending, 0,
@@ -156,11 +150,9 @@
 {
 	trim_map_t *tm;
 
-	ASSERT(vd->vdev_ops->vdev_op_leaf);
+	ASSERT(zfs_trim_enabled && !vd->vdev_notrim &&
+		vd->vdev_ops->vdev_op_leaf);
 
-	if (!zfs_trim_enabled)
-		return;
-
 	tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
 	mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&tm->tm_head, sizeof (trim_seg_t),
@@ -201,10 +193,8 @@
 	mutex_enter(&tm->tm_lock);
 	while ((ts = list_head(&tm->tm_head)) != NULL) {
 		avl_remove(&tm->tm_queued_frees, ts);
-		list_remove(&tm->tm_head, ts);
+		TRIM_MAP_REM(tm, ts);
 		kmem_free(ts, sizeof (*ts));
-		TRIM_MAP_SDEC(tm, ts->ts_end - ts->ts_start);
-		TRIM_MAP_QDEC(tm);
 	}
 	mutex_exit(&tm->tm_lock);
 
@@ -249,27 +239,27 @@
 	merge_after = (ts_after != NULL && ts_after->ts_start == end);
 
 	if (merge_before && merge_after) {
-		TRIM_MAP_SINC(tm, ts_after->ts_start - ts_before->ts_end);
-		TRIM_MAP_QDEC(tm);
 		avl_remove(&tm->tm_queued_frees, ts_before);
-		list_remove(&tm->tm_head, ts_before);
+		TRIM_MAP_REM(tm, ts_before);
+		TRIM_MAP_REM(tm, ts_after);
 		ts_after->ts_start = ts_before->ts_start;
 		ts_after->ts_txg = txg;
 		ts_after->ts_time = time;
+		TRIM_MAP_ADD(tm, ts_after);
 		kmem_free(ts_before, sizeof (*ts_before));
 	} else if (merge_before) {
-		TRIM_MAP_SINC(tm, end - ts_before->ts_end);
+		TRIM_MAP_REM(tm, ts_before);
 		ts_before->ts_end = end;
 		ts_before->ts_txg = txg;
 		ts_before->ts_time = time;
+		TRIM_MAP_ADD(tm, ts_before);
 	} else if (merge_after) {
-		TRIM_MAP_SINC(tm, ts_after->ts_start - start);
+		TRIM_MAP_REM(tm, ts_after);
 		ts_after->ts_start = start;
 		ts_after->ts_txg = txg;
 		ts_after->ts_time = time;
+		TRIM_MAP_ADD(tm, ts_after);
 	} else {
-		TRIM_MAP_SINC(tm, end - start);
-		TRIM_MAP_QINC(tm);
 		ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
 		ts->ts_start = start;
 		ts->ts_end = end;
@@ -276,7 +266,7 @@
 		ts->ts_txg = txg;
 		ts->ts_time = time;
 		avl_insert(&tm->tm_queued_frees, ts, where);
-		list_insert_tail(&tm->tm_head, ts);
+		TRIM_MAP_ADD(tm, ts);
 	}
 }
 
@@ -292,7 +282,7 @@
 	left_over = (ts->ts_start < start);
 	right_over = (ts->ts_end > end);
 
-	TRIM_MAP_SDEC(tm, end - start);
+	TRIM_MAP_REM(tm, ts);
 	if (left_over && right_over) {
 		nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
 		nts->ts_start = end;
@@ -301,16 +291,16 @@
 		nts->ts_time = ts->ts_time;
 		ts->ts_end = start;
 		avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
-		list_insert_after(&tm->tm_head, ts, nts);
-		TRIM_MAP_QINC(tm);
+		TRIM_MAP_ADD(tm, ts);
+		TRIM_MAP_ADD(tm, nts);
 	} else if (left_over) {
 		ts->ts_end = start;
+		TRIM_MAP_ADD(tm, ts);
 	} else if (right_over) {
 		ts->ts_start = end;
+		TRIM_MAP_ADD(tm, ts);
 	} else {
 		avl_remove(&tm->tm_queued_frees, ts);
-		list_remove(&tm->tm_head, ts);
-		TRIM_MAP_QDEC(tm);
 		kmem_free(ts, sizeof (*ts));
 	}
 }
@@ -429,7 +419,8 @@
  *    the first element's time is not greater than time argument
  */
 static trim_seg_t *
-trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time)
+trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time,
+    boolean_t force)
 {
 	trim_seg_t *ts;
 
@@ -438,9 +429,7 @@
 
 	ts = list_head(&tm->tm_head);
 	if (ts != NULL && ts->ts_txg <= txgsafe &&
-	    (ts->ts_txg <= txg || ts->ts_time <= time ||
-	    tm->tm_bytes > trim_vdev_max_bytes ||
-	    tm->tm_pending > trim_vdev_max_pending))
+	    (ts->ts_txg <= txg || ts->ts_time <= time || force))
 		return (ts);
 	return (NULL);
 }
@@ -450,7 +439,8 @@
 {
 	trim_map_t *tm = vd->vdev_trimmap;
 	trim_seg_t *ts;
-	uint64_t size, txgtarget, txgsafe;
+	uint64_t size, offset, txgtarget, txgsafe;
+	int64_t hard, soft;
 	hrtime_t timelimit;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
@@ -458,7 +448,7 @@
 	if (tm == NULL)
 		return;
 
-	timelimit = gethrtime() - trim_timeout * NANOSEC;
+	timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC;
 	if (vd->vdev_isl2cache) {
 		txgsafe = UINT64_MAX;
 		txgtarget = UINT64_MAX;
@@ -471,16 +461,31 @@
 	}
 
 	mutex_enter(&tm->tm_lock);
+	hard = 0;
+	if (tm->tm_pending > trim_vdev_max_pending)
+		hard = (tm->tm_pending - trim_vdev_max_pending) / 4;
+	soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64);
 	/* Loop until we have sent all outstanding frees */
-	while ((ts = trim_map_first(tm, txgtarget, txgsafe, timelimit))
+	while (soft > 0 &&
+	    (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0))
 	    != NULL) {
-		list_remove(&tm->tm_head, ts);
+		TRIM_MAP_REM(tm, ts);
 		avl_remove(&tm->tm_queued_frees, ts);
 		avl_add(&tm->tm_inflight_frees, ts);
 		size = ts->ts_end - ts->ts_start;
-		zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, size));
-		TRIM_MAP_SDEC(tm, size);
-		TRIM_MAP_QDEC(tm);
+		offset = ts->ts_start;
+		/*
+		 * We drop the lock while we call zio_nowait(), as the I/O
+		 * scheduler may run a different I/O (e.g. a write) inline,
+		 * which would otherwise recurse on this lock.
+		 */
+		mutex_exit(&tm->tm_lock);
+
+		zio_nowait(zio_trim(zio, spa, vd, offset, size));
+
+		soft -= TRIM_MAP_SEGS(size);
+		hard -= TRIM_MAP_SEGS(size);
+		mutex_enter(&tm->tm_lock);
 	}
 	mutex_exit(&tm->tm_lock);
 }

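Worked numbers for the new segment accounting above, taking
trim_vdev_max_pending = 10000, trim_timeout = 30, and
TRIM_MAP_SIZE_FACTOR = 512 << 16 = 32 MiB:

/*
 * A 100 MiB free range costs
 *   TRIM_MAP_SEGS(100 MiB) = 1 + 100 MiB / 32 MiB = 4 segments.
 *
 * With tm_pending = 12000 queued segments:
 *   hard = (12000 - 10000) / 4              = 500
 *   soft = P2ROUNDUP(500 + 12000/30 + 1, 64)
 *        = P2ROUNDUP(901, 64)               = 960
 * so this pass issues up to 960 segments, and at least enough to work
 * off the 'hard' backlog.
 */
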
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Portions Copyright 2011 Martin Matuska <mm at FreeBSD.org>
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -46,7 +46,7 @@
  * either be processing, or blocked waiting to enter the next state. There may
  * be up to three active txgs, and there is always a txg in the open state
  * (though it may be blocked waiting to enter the quiescing state). In broad
- * strokes, transactions — operations that change in-memory structures — are
+ * strokes, transactions -- operations that change in-memory structures -- are
  * accepted into the txg in the open state, and are completed while the txg is
  * in the open or quiescing states. The accumulated changes are written to
  * disk in the syncing state.
@@ -54,7 +54,7 @@
  * Open
  *
  * When a new txg becomes active, it first enters the open state. New
- * transactions — updates to in-memory structures — are assigned to the
+ * transactions -- updates to in-memory structures -- are assigned to the
  * currently open txg. There is always a txg in the open state so that ZFS can
  * accept new changes (though the txg may refuse new changes if it has hit
  * some limit). ZFS advances the open txg to the next state for a variety of
@@ -242,7 +242,7 @@
 }
 
 static void
-txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time)
+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
 {
 	CALLB_CPR_SAFE_BEGIN(cpr);
 
@@ -355,7 +355,7 @@
  * On return, the transaction group has reached a stable state in which it can
  * then be passed off to the syncing context.
  */
-static void
+static __noinline void
 txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 {
 	tx_state_t *tx = &dp->dp_tx;
@@ -370,7 +370,11 @@
 
 	ASSERT(txg == tx->tx_open_txg);
 	tx->tx_open_txg++;
+	tx->tx_open_time = gethrtime();
 
+	DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
+	DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
+
 	/*
 	 * Now that we've incremented tx_open_txg, we can let threads
 	 * enter the next transaction group.
@@ -460,7 +464,8 @@
 
 	start = delta = 0;
 	for (;;) {
-		uint64_t timer, timeout = zfs_txg_timeout * hz;
+		uint64_t timeout = zfs_txg_timeout * hz;
+		uint64_t timer;
 		uint64_t txg;
 
 		/*
@@ -472,7 +477,8 @@
 		while (!dsl_scan_active(dp->dp_scan) &&
 		    !tx->tx_exiting && timer > 0 &&
 		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
-		    tx->tx_quiesced_txg == 0) {
+		    tx->tx_quiesced_txg == 0 &&
+		    dp->dp_dirty_total < zfs_dirty_data_sync) {
 			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
 			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
 			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
@@ -502,6 +508,7 @@
 		txg = tx->tx_quiesced_txg;
 		tx->tx_quiesced_txg = 0;
 		tx->tx_syncing_txg = txg;
+		DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
 		cv_broadcast(&tx->tx_quiesce_more_cv);
 
 		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
@@ -515,6 +522,7 @@
 		mutex_enter(&tx->tx_sync_lock);
 		tx->tx_synced_txg = txg;
 		tx->tx_syncing_txg = 0;
+		DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
 		cv_broadcast(&tx->tx_sync_done_cv);
 
 		/*
@@ -564,6 +572,7 @@
 		 */
 		dprintf("quiesce done, handing off txg %llu\n", txg);
 		tx->tx_quiesced_txg = txg;
+		DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
 		cv_broadcast(&tx->tx_sync_more_cv);
 		cv_broadcast(&tx->tx_quiesce_done_cv);
 	}
@@ -570,15 +579,15 @@
 }
 
 /*
- * Delay this thread by 'ticks' if we are still in the open transaction
- * group and there is already a waiting txg quiescing or quiesced.
- * Abort the delay if this txg stalls or enters the quiescing state.
+ * Delay this thread by 'delay' nanoseconds if we are still in the open
+ * transaction group and there is already a waiting txg quiescing or quiesced.
+ * Abort the delay if this txg stalls or enters the quiescing state.
  */
 void
-txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
+txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
 {
 	tx_state_t *tx = &dp->dp_tx;
-	clock_t timeout = ddi_get_lbolt() + ticks;
+	hrtime_t start = gethrtime();
 
 	/* don't delay if this txg could transition to quiescing immediately */
 	if (tx->tx_open_txg > txg ||
@@ -591,10 +600,11 @@
 		return;
 	}
 
-	while (ddi_get_lbolt() < timeout &&
-	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp))
-		(void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock,
-		    timeout - ddi_get_lbolt());
+	while (gethrtime() - start < delay &&
+	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
+		(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
+		    &tx->tx_sync_lock, delay, resolution, 0);
+	}
 
 	mutex_exit(&tx->tx_sync_lock);
 }
@@ -646,6 +656,28 @@
 	mutex_exit(&tx->tx_sync_lock);
 }
 
+/*
+ * If there isn't a txg syncing or in the pipeline, push another txg through
+ * the pipeline by quiescing the open txg.
+ */
+void
+txg_kick(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	ASSERT(!dsl_pool_config_held(dp));
+
+	mutex_enter(&tx->tx_sync_lock);
+	if (tx->tx_syncing_txg == 0 &&
+	    tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
+	    tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
+	    tx->tx_quiesced_txg <= tx->tx_synced_txg) {
+		tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
+		cv_broadcast(&tx->tx_quiesce_more_cv);
+	}
+	mutex_exit(&tx->tx_sync_lock);
+}
+
 boolean_t
 txg_stalled(dsl_pool_t *dp)
 {
@@ -696,6 +728,24 @@
 }
 
 /*
+ * Returns true if all txg lists are empty.
+ *
+ * Warning: this is inherently racy (an item could be added immediately
+ * after this function returns).  We don't bother with the lock because
+ * it wouldn't change the semantics.
+ */
+boolean_t
+txg_all_lists_empty(txg_list_t *tl)
+{
+	for (int i = 0; i < TXG_SIZE; i++) {
+		if (!txg_list_empty(tl, i)) {
+			return (B_FALSE);
+		}
+	}
+	return (B_TRUE);
+}
+
+/*
  * Add an entry to the list (unless it's already on the list).
  * Returns B_TRUE if it was actually added.
  */

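The new static probes make the txg pipeline observable; their firing order
for one txg N (probe names assumed from the DTRACE_PROBE2 sites above, with
'__' rendered as '-' by SDT):

/*
 * txg-quiescing (N) -> txg-opened (N+1)   [txg_quiesce()]
 *   -> txg-quiesced (N)                   [quiesce thread hand-off]
 *   -> txg-syncing (N) -> txg-synced (N)  [sync thread]
 */
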
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -41,10 +41,10 @@
 }
 
 /*
- * Update the uberblock and return a boolean value indicating whether
- * anything changed in this transaction group.
+ * Update the uberblock and return TRUE if anything changed in this
+ * transaction group.
  */
-int
+boolean_t
 uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
 {
 	ASSERT(ub->ub_txg < txg);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,9 +22,10 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
@@ -38,6 +39,7 @@
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/space_map.h>
+#include <sys/space_reftree.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
@@ -53,6 +55,90 @@
  * Virtual device management.
  */
 
+/*
+ * The limit for ZFS to automatically increase a top-level vdev's ashift
+ * from logical ashift to physical ashift.
+ *
+ * Example: one or more 512B emulation child vdevs
+ *          child->vdev_ashift = 9 (512 bytes)
+ *          child->vdev_physical_ashift = 12 (4096 bytes)
+ *          zfs_max_auto_ashift = 11 (2048 bytes)
+ *          zfs_min_auto_ashift = 9 (512 bytes)
+ *
+ * On pool creation or the addition of a new top-level vdev, ZFS will
+ * increase the ashift of the top-level vdev to 2048 as limited by
+ * zfs_max_auto_ashift.
+ *
+ * Example: one or more 512B emulation child vdevs
+ *          child->vdev_ashift = 9 (512 bytes)
+ *          child->vdev_physical_ashift = 12 (4096 bytes)
+ *          zfs_max_auto_ashift = 13 (8192 bytes)
+ *          zfs_min_auto_ashift = 9 (512 bytes)
+ *
+ * On pool creation or the addition of a new top-level vdev, ZFS will
+ * increase the ashift of the top-level vdev to 4096 to match the
+ * max vdev_physical_ashift.
+ *
+ * Example: one or more 512B emulation child vdevs
+ *          child->vdev_ashift = 9 (512 bytes)
+ *          child->vdev_physical_ashift = 9 (512 bytes)
+ *          zfs_max_auto_ashift = 13 (8192 bytes)
+ *          zfs_min_auto_ashift = 12 (4096 bytes)
+ *
+ * On pool creation or the addition of a new top-level vdev, ZFS will
+ * increase the ashift of the top-level vdev to 4096 to match the
+ * zfs_min_auto_ashift.
+ */
+static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT;
+static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT;
+
+static int
+sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int err;
+
+	val = zfs_max_auto_ashift;
+	err = sysctl_handle_64(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift)
+		return (EINVAL);
+
+	zfs_max_auto_ashift = val;
+
+	return (0);
+}
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
+    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+    sysctl_vfs_zfs_max_auto_ashift, "QU",
+    "Max ashift used when optimising for logical -> physical sectors size on "
+    "new top-level vdevs.");
+
+static int
+sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int err;
+
+	val = zfs_min_auto_ashift;
+	err = sysctl_handle_64(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift)
+		return (EINVAL);
+
+	zfs_min_auto_ashift = val;
+
+	return (0);
+}
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
+    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+    sysctl_vfs_zfs_min_auto_ashift, "QU",
+    "Min ashift used when creating new top-level vdevs.");
+
 static vdev_ops_t *vdev_ops_table[] = {
 	&vdev_root_ops,
 	&vdev_raidz_ops,
@@ -72,6 +158,15 @@
 
 
 /*
+ * When a vdev is added, it will be divided into approximately (but no
+ * more than) this number of metaslabs.
+ */
+int metaslabs_per_vdev = 200;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN,
+    &metaslabs_per_vdev, 0,
+    "When a vdev is added, how many metaslabs the vdev should be divided into");
+
+/*
  * Given a vdev type, return the appropriate ops vector.
  */
 static vdev_ops_t *
@@ -179,6 +274,26 @@
 	return (NULL);
 }
 
+static int
+vdev_count_leaves_impl(vdev_t *vd)
+{
+	int n = 0;
+
+	if (vd->vdev_ops->vdev_op_leaf)
+		return (1);
+
+	for (int c = 0; c < vd->vdev_children; c++)
+		n += vdev_count_leaves_impl(vd->vdev_child[c]);
+
+	return (n);
+}
+
+int
+vdev_count_leaves(spa_t *spa)
+{
+	return (vdev_count_leaves_impl(spa->spa_root_vdev));
+}
+
 void
 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
 {
@@ -185,8 +300,9 @@
 	size_t oldsize, newsize;
 	uint64_t id = cvd->vdev_id;
 	vdev_t **newchild;
+	spa_t *spa = cvd->vdev_spa;
 
-	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(cvd->vdev_parent == NULL);
 
 	cvd->vdev_parent = pvd;
@@ -326,8 +442,9 @@
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 	for (int t = 0; t < DTL_TYPES; t++) {
-		space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
+		vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
 		    &vd->vdev_dtl_lock);
 	}
 	txg_list_create(&vd->vdev_ms_list,
@@ -513,7 +630,7 @@
 	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
 		if (alloctype == VDEV_ALLOC_LOAD) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
-			    &vd->vdev_dtl_smo.smo_object);
+			    &vd->vdev_dtl_object);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 			    &vd->vdev_unspare);
 		}
@@ -529,8 +646,8 @@
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 		    &vd->vdev_offline);
 
-		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
-		    &vd->vdev_resilvering);
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
+		    &vd->vdev_resilver_txg);
 
 		/*
 		 * When importing a pool, we want to ignore the persistent fault
@@ -635,12 +752,14 @@
 	txg_list_destroy(&vd->vdev_dtl_list);
 
 	mutex_enter(&vd->vdev_dtl_lock);
+	space_map_close(vd->vdev_dtl_sm);
 	for (int t = 0; t < DTL_TYPES; t++) {
-		space_map_unload(&vd->vdev_dtl[t]);
-		space_map_destroy(&vd->vdev_dtl[t]);
+		range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
+		range_tree_destroy(vd->vdev_dtl[t]);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 
+	mutex_destroy(&vd->vdev_queue_lock);
 	mutex_destroy(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_stat_lock);
 	mutex_destroy(&vd->vdev_probe_lock);
@@ -747,6 +866,8 @@
 	mvd->vdev_min_asize = cvd->vdev_min_asize;
 	mvd->vdev_max_asize = cvd->vdev_max_asize;
 	mvd->vdev_ashift = cvd->vdev_ashift;
+	mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
+	mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
 	mvd->vdev_state = cvd->vdev_state;
 	mvd->vdev_crtxg = cvd->vdev_crtxg;
 
@@ -778,6 +899,8 @@
 	    mvd->vdev_ops == &vdev_replacing_ops ||
 	    mvd->vdev_ops == &vdev_spare_ops);
 	cvd->vdev_ashift = mvd->vdev_ashift;
+	cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
+	cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
 
 	vdev_remove_child(mvd, cvd);
 	vdev_remove_child(pvd, mvd);
@@ -828,9 +951,9 @@
 
 	/*
 	 * Compute the raidz-deflation ratio.  Note, we hard-code
-	 * in 128k (1 << 17) because it is the current "typical" blocksize.
-	 * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
-	 * or we will inconsistently account for existing bp's.
+	 * in 128k (1 << 17) because it is the "typical" blocksize.
+	 * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
+	 * otherwise it would inconsistently account for existing bp's.
 	 */
 	vd->vdev_deflate_ratio = (1 << 17) /
 	    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
@@ -848,27 +971,20 @@
 	vd->vdev_ms_count = newc;
 
 	for (m = oldc; m < newc; m++) {
-		space_map_obj_t smo = { 0, 0, 0 };
+		uint64_t object = 0;
+
 		if (txg == 0) {
-			uint64_t object = 0;
 			error = dmu_read(mos, vd->vdev_ms_array,
 			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
 			    DMU_READ_PREFETCH);
 			if (error)
 				return (error);
-			if (object != 0) {
-				dmu_buf_t *db;
-				error = dmu_bonus_hold(mos, object, FTAG, &db);
-				if (error)
-					return (error);
-				ASSERT3U(db->db_size, >=, sizeof (smo));
-				bcopy(db->db_data, &smo, sizeof (smo));
-				ASSERT3U(smo.smo_object, ==, object);
-				dmu_buf_rele(db, FTAG);
-			}
 		}
-		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
-		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
+
+		error = metaslab_init(vd->vdev_mg, m, object, txg,
+		    &(vd->vdev_ms[m]));
+		if (error)
+			return (error);
 	}
 
 	if (txg == 0)
@@ -896,9 +1012,12 @@
 
 	if (vd->vdev_ms != NULL) {
 		metaslab_group_passivate(vd->vdev_mg);
-		for (m = 0; m < count; m++)
-			if (vd->vdev_ms[m] != NULL)
-				metaslab_fini(vd->vdev_ms[m]);
+		for (m = 0; m < count; m++) {
+			metaslab_t *msp = vd->vdev_ms[m];
+
+			if (msp != NULL)
+				metaslab_fini(msp);
+		}
 		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
 		vd->vdev_ms = NULL;
 	}
@@ -955,7 +1074,8 @@
 		vd->vdev_probe_zio = NULL;
 		mutex_exit(&vd->vdev_probe_lock);
 
-		while ((pio = zio_walk_parents(zio)) != NULL)
+		zio_link_t *zl = NULL;
+		while ((pio = zio_walk_parents(zio, &zl)) != NULL)
 			if (!vdev_accessible(vd, pio))
 				pio->io_error = SET_ERROR(ENXIO);
 
@@ -1121,7 +1241,8 @@
 	uint64_t osize = 0;
 	uint64_t max_osize = 0;
 	uint64_t asize, max_asize, psize;
-	uint64_t ashift = 0;
+	uint64_t logical_ashift = 0;
+	uint64_t physical_ashift = 0;
 
 	ASSERT(vd->vdev_open_thread == curthread ||
 	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
@@ -1132,6 +1253,7 @@
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	vd->vdev_cant_read = B_FALSE;
 	vd->vdev_cant_write = B_FALSE;
+	vd->vdev_notrim = B_FALSE;
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	/*
@@ -1151,7 +1273,8 @@
 		return (SET_ERROR(ENXIO));
 	}
 
-	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);
+	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
+	    &logical_ashift, &physical_ashift);
 
 	/*
 	 * Reset the vdev_reopening flag so that we actually close
@@ -1200,10 +1323,8 @@
 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
 		return (0);
 
-	if (vd->vdev_ops->vdev_op_leaf) {
-		vd->vdev_notrim = B_FALSE;
+	if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf)
 		trim_map_create(vd);
-	}
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
@@ -1249,6 +1370,17 @@
 		return (SET_ERROR(EINVAL));
 	}
 
+	vd->vdev_physical_ashift =
+	    MAX(physical_ashift, vd->vdev_physical_ashift);
+	vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
+	vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);
+
+	if (vd->vdev_logical_ashift > SPA_MAXASHIFT) {
+		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_ASHIFT_TOO_BIG);
+		return (EINVAL);
+	}
+
 	if (vd->vdev_asize == 0) {
 		/*
 		 * This is the first-ever open, so use the computed values.
@@ -1256,12 +1388,12 @@
 		 */
 		vd->vdev_asize = asize;
 		vd->vdev_max_asize = max_asize;
-		vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
 	} else {
 		/*
 		 * Make sure the alignment requirement hasn't increased.
 		 */
-		if (ashift > vd->vdev_top->vdev_ashift) {
+		if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
+		    vd->vdev_ops->vdev_op_leaf) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_LABEL);
 			return (EINVAL);
@@ -1292,6 +1424,17 @@
 	}
 
 	/*
+	 * Track the min and max ashift values for normal data devices.
+	 */
+	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+	    !vd->vdev_islog && vd->vdev_aux == NULL) {
+		if (vd->vdev_ashift > spa->spa_max_ashift)
+			spa->spa_max_ashift = vd->vdev_ashift;
+		if (vd->vdev_ashift < spa->spa_min_ashift)
+			spa->spa_min_ashift = vd->vdev_ashift;
+	}
+
+	/*
 	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
 	 * resilver.  But don't do this if we are doing a reopen for a scrub,
 	 * since this would just restart the scrub we are already doing.
@@ -1553,9 +1696,10 @@
 	}
 
 	/*
-	 * Recursively initialize all labels.
+	 * Recursively load DTLs and initialize all labels.
 	 */
-	if ((error = vdev_label_init(vd, txg, isreplacing ?
+	if ((error = vdev_dtl_load(vd)) != 0 ||
+	    (error = vdev_label_init(vd, txg, isreplacing ?
 	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
 		vdev_close(vd);
 		return (error);
@@ -1568,13 +1712,41 @@
 vdev_metaslab_set_size(vdev_t *vd)
 {
 	/*
-	 * Aim for roughly 200 metaslabs per vdev.
+	 * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
 	 */
-	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
+	vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
 	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
 }
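
To make the arithmetic in vdev_metaslab_set_size() concrete: a 1 TiB vdev
with the default metaslabs_per_vdev of 200 yields ms_shift = 33, i.e. 8 GiB
metaslabs and 128 of them (approximately, but never more than, 200).  A
standalone sketch of the calculation, with highbit64() modeled locally:

#include <stdint.h>
#include <stdio.h>

/* Model of highbit64(): 1-indexed position of the highest set bit. */
static int
highbit64(uint64_t i)
{
	int h = 0;

	while (i != 0) {
		h++;
		i >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t asize = 1ULL << 40;	/* 1 TiB vdev */
	int metaslabs_per_vdev = 200;	/* default tunable */
	int spa_maxblockshift = 17;	/* 128 KiB, the lower bound */
	int ms_shift;

	ms_shift = highbit64(asize / metaslabs_per_vdev);
	if (ms_shift < spa_maxblockshift)
		ms_shift = spa_maxblockshift;

	printf("ms_shift=%d, metaslab=%llu bytes, count=%llu\n", ms_shift,
	    (unsigned long long)(1ULL << ms_shift),
	    (unsigned long long)(asize >> ms_shift));
	return (0);
}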
 
+/*
+ * Maximize performance by inflating the configured ashift for top level
+ * vdevs to be as close to the physical ashift as possible while maintaining
+ * administrator defined limits and ensuring it doesn't go below the
+ * logical ashift.
+ */
 void
+vdev_ashift_optimize(vdev_t *vd)
+{
+	if (vd == vd->vdev_top) {
+		if (vd->vdev_ashift < vd->vdev_physical_ashift) {
+			vd->vdev_ashift = MIN(
+			    MAX(zfs_max_auto_ashift, vd->vdev_ashift),
+			    MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift));
+		} else {
+			/*
+			 * Unusual case where logical ashift > physical ashift
+			 * so we can't cap the calculated ashift based on max
+			 * ashift as that would cause failures.
+			 * We still check if we need to increase it to match
+			 * the min ashift.
+			 */
+			vd->vdev_ashift = MAX(zfs_min_auto_ashift,
+			    vd->vdev_ashift);
+		}
+	}
+}
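
The clamping in vdev_ashift_optimize() reproduces the three worked examples
from the comment block earlier in this hunk.  A user-space sketch of the
formula (tunables passed as plain arguments, MIN/MAX spelled out; not the
kernel function):

#include <stdint.h>
#include <stdio.h>

#define	MY_MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MY_MIN(a, b)	((a) < (b) ? (a) : (b))

/* Model of vdev_ashift_optimize() for a top-level vdev. */
static uint64_t
optimize_ashift(uint64_t ashift, uint64_t physical_ashift,
    uint64_t min_auto, uint64_t max_auto)
{
	if (ashift < physical_ashift) {
		return (MY_MIN(MY_MAX(max_auto, ashift),
		    MY_MAX(min_auto, physical_ashift)));
	}
	/* logical >= physical: only the min tunable can raise it */
	return (MY_MAX(min_auto, ashift));
}

int
main(void)
{
	/* The three examples from the comment block in this hunk: */
	printf("%llu\n", (unsigned long long)optimize_ashift(9, 12, 9, 11));
	/* -> 11 (2048B, capped by zfs_max_auto_ashift) */
	printf("%llu\n", (unsigned long long)optimize_ashift(9, 12, 9, 13));
	/* -> 12 (4096B, matches vdev_physical_ashift) */
	printf("%llu\n", (unsigned long long)optimize_ashift(9, 9, 12, 13));
	/* -> 12 (raised to zfs_min_auto_ashift) */
	return (0);
}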
+
+void
 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
 {
 	ASSERT(vd == vd->vdev_top);
@@ -1591,6 +1763,16 @@
 	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
 }
 
+void
+vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
+{
+	for (int c = 0; c < vd->vdev_children; c++)
+		vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
+
+	if (vd->vdev_ops->vdev_op_leaf)
+		vdev_dirty(vd->vdev_top, flags, vd, txg);
+}
+
 /*
  * DTLs.
  *
@@ -1632,31 +1814,31 @@
 void
 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
-	space_map_t *sm = &vd->vdev_dtl[t];
+	range_tree_t *rt = vd->vdev_dtl[t];
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 	ASSERT(spa_writeable(vd->vdev_spa));
 
-	mutex_enter(sm->sm_lock);
-	if (!space_map_contains(sm, txg, size))
-		space_map_add(sm, txg, size);
-	mutex_exit(sm->sm_lock);
+	mutex_enter(rt->rt_lock);
+	if (!range_tree_contains(rt, txg, size))
+		range_tree_add(rt, txg, size);
+	mutex_exit(rt->rt_lock);
 }
 
 boolean_t
 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
-	space_map_t *sm = &vd->vdev_dtl[t];
+	range_tree_t *rt = vd->vdev_dtl[t];
 	boolean_t dirty = B_FALSE;
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
-	mutex_enter(sm->sm_lock);
-	if (sm->sm_space != 0)
-		dirty = space_map_contains(sm, txg, size);
-	mutex_exit(sm->sm_lock);
+	mutex_enter(rt->rt_lock);
+	if (range_tree_space(rt) != 0)
+		dirty = range_tree_contains(rt, txg, size);
+	mutex_exit(rt->rt_lock);
 
 	return (dirty);
 }
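
With this change the DTLs are plain range trees of txg intervals:
vdev_dtl_dirty() records [txg, txg + size) as missing and
vdev_dtl_contains() queries it.  A toy model of those two operations (a
fixed array instead of the AVL-backed range tree, no coalescing; size is
always 1 here, the granularity the DTLs actually use, so an overlap test
suffices):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for a DTL range tree: a few [start, end) txg segments. */
struct seg { uint64_t start, end; };

static struct seg dtl[8];
static int ndtl;

/* Model of vdev_dtl_dirty(): txgs [txg, txg+size) were missed. */
static void
dtl_dirty(uint64_t txg, uint64_t size)
{
	dtl[ndtl].start = txg;
	dtl[ndtl].end = txg + size;
	ndtl++;		/* the kernel's range tree would coalesce */
}

/* Model of vdev_dtl_contains(): is any txg in [txg, txg+size) missing? */
static bool
dtl_contains(uint64_t txg, uint64_t size)
{
	for (int i = 0; i < ndtl; i++)
		if (txg < dtl[i].end && txg + size > dtl[i].start)
			return (true);
	return (false);
}

int
main(void)
{
	dtl_dirty(100, 5);			/* missed txgs 100..104 */
	printf("%d\n", dtl_contains(102, 1));	/* 1: inside the gap */
	printf("%d\n", dtl_contains(105, 1));	/* 0: after the gap */
	return (0);
}
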
@@ -1664,17 +1846,89 @@
 boolean_t
 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
 {
-	space_map_t *sm = &vd->vdev_dtl[t];
+	range_tree_t *rt = vd->vdev_dtl[t];
 	boolean_t empty;
 
-	mutex_enter(sm->sm_lock);
-	empty = (sm->sm_space == 0);
-	mutex_exit(sm->sm_lock);
+	mutex_enter(rt->rt_lock);
+	empty = (range_tree_space(rt) == 0);
+	mutex_exit(rt->rt_lock);
 
 	return (empty);
 }
 
 /*
+ * Returns the lowest txg in the DTL range.
+ */
+static uint64_t
+vdev_dtl_min(vdev_t *vd)
+{
+	range_seg_t *rs;
+
+	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+	ASSERT0(vd->vdev_children);
+
+	rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
+	return (rs->rs_start - 1);
+}
+
+/*
+ * Returns the highest txg in the DTL.
+ */
+static uint64_t
+vdev_dtl_max(vdev_t *vd)
+{
+	range_seg_t *rs;
+
+	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+	ASSERT0(vd->vdev_children);
+
+	rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
+	return (rs->rs_end);
+}
+
+/*
+ * Determine if a resilvering vdev should remove any DTL entries from
+ * its range. If the vdev was resilvering for the entire duration of the
+ * scan then it should excise that range from its DTLs. Otherwise, this
+ * vdev is considered partially resilvered and should leave its DTL
+ * entries intact. The comment in vdev_dtl_reassess() describes how we
+ * excise the DTLs.
+ */
+static boolean_t
+vdev_dtl_should_excise(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
+	ASSERT0(scn->scn_phys.scn_errors);
+	ASSERT0(vd->vdev_children);
+
+	if (vd->vdev_state < VDEV_STATE_DEGRADED)
+		return (B_FALSE);
+
+	if (vd->vdev_resilver_txg == 0 ||
+	    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
+		return (B_TRUE);
+
+	/*
+	 * When a resilver is initiated the scan will assign the scn_max_txg
+	 * value to the highest txg value that exists in all DTLs. If this
+	 * device's max DTL is not part of this scan (i.e. it is not in
+	 * the range (scn_min_txg, scn_max_txg]), then it is not eligible
+	 * for excision.
+	 */
+	if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
+		ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
+		ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
+		ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+/*
  * Reassess DTLs after a config change or scrub completion.
  */
 void
@@ -1697,9 +1951,17 @@
 		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 
 		mutex_enter(&vd->vdev_dtl_lock);
+
+		/*
+		 * If we've completed a scan cleanly then determine
+		 * if this vdev should remove any DTLs. We only want to
+		 * excise regions on vdevs that were available during
+		 * the entire duration of this scan.
+		 */
 		if (scrub_txg != 0 &&
 		    (spa->spa_scrub_started ||
-		    (scn && scn->scn_phys.scn_errors == 0))) {
+		    (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
+		    vdev_dtl_should_excise(vd)) {
 			/*
 			 * We completed a scrub up to scrub_txg.  If we
 			 * did it without rebooting, then the scrub dtl
@@ -1717,27 +1979,40 @@
 			 * positive refcnt -- either 1 or 2.  We then convert
 			 * the reference tree into the new DTL_MISSING map.
 			 */
-			space_map_ref_create(&reftree);
-			space_map_ref_add_map(&reftree,
-			    &vd->vdev_dtl[DTL_MISSING], 1);
-			space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
-			space_map_ref_add_map(&reftree,
-			    &vd->vdev_dtl[DTL_SCRUB], 2);
-			space_map_ref_generate_map(&reftree,
-			    &vd->vdev_dtl[DTL_MISSING], 1);
-			space_map_ref_destroy(&reftree);
+			space_reftree_create(&reftree);
+			space_reftree_add_map(&reftree,
+			    vd->vdev_dtl[DTL_MISSING], 1);
+			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
+			space_reftree_add_map(&reftree,
+			    vd->vdev_dtl[DTL_SCRUB], 2);
+			space_reftree_generate_map(&reftree,
+			    vd->vdev_dtl[DTL_MISSING], 1);
+			space_reftree_destroy(&reftree);
 		}
-		space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
-		space_map_walk(&vd->vdev_dtl[DTL_MISSING],
-		    space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
+		range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
+		range_tree_walk(vd->vdev_dtl[DTL_MISSING],
+		    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
 		if (scrub_done)
-			space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
-		space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
+			range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
+		range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
 		if (!vdev_readable(vd))
-			space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
+			range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
 		else
-			space_map_walk(&vd->vdev_dtl[DTL_MISSING],
-			    space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
+			range_tree_walk(vd->vdev_dtl[DTL_MISSING],
+			    range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
+
+		/*
+		 * If the vdev was resilvering and no longer has any
+		 * DTLs then reset its resilvering flag and dirty
+		 * the top level so that we persist the change.
+		 */
+		if (vd->vdev_resilver_txg != 0 &&
+		    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
+		    range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) {
+			vd->vdev_resilver_txg = 0;
+			vdev_config_dirty(vd->vdev_top);
+		}
+
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		if (txg != 0)
@@ -1757,47 +2032,56 @@
 			minref = vd->vdev_nparity + 1;	/* RAID-Z */
 		else
 			minref = vd->vdev_children;	/* any kind of mirror */
-		space_map_ref_create(&reftree);
+		space_reftree_create(&reftree);
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			mutex_enter(&cvd->vdev_dtl_lock);
-			space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
+			space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
 			mutex_exit(&cvd->vdev_dtl_lock);
 		}
-		space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
-		space_map_ref_destroy(&reftree);
+		space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
+		space_reftree_destroy(&reftree);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 }
 
-static int
+int
 vdev_dtl_load(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
-	space_map_obj_t *smo = &vd->vdev_dtl_smo;
 	objset_t *mos = spa->spa_meta_objset;
-	dmu_buf_t *db;
-	int error;
+	int error = 0;
 
-	ASSERT(vd->vdev_children == 0);
+	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
+		ASSERT(!vd->vdev_ishole);
 
-	if (smo->smo_object == 0)
-		return (0);
+		error = space_map_open(&vd->vdev_dtl_sm, mos,
+		    vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+		if (error)
+			return (error);
+		ASSERT(vd->vdev_dtl_sm != NULL);
 
-	ASSERT(!vd->vdev_ishole);
+		mutex_enter(&vd->vdev_dtl_lock);
 
-	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
+		/*
+		 * Now that we've opened the space_map we need to update
+		 * the in-core DTL.
+		 */
+		space_map_update(vd->vdev_dtl_sm);
+
+		error = space_map_load(vd->vdev_dtl_sm,
+		    vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
+		mutex_exit(&vd->vdev_dtl_lock);
+
 		return (error);
+	}
 
-	ASSERT3U(db->db_size, >=, sizeof (*smo));
-	bcopy(db->db_data, smo, sizeof (*smo));
-	dmu_buf_rele(db, FTAG);
+	for (int c = 0; c < vd->vdev_children; c++) {
+		error = vdev_dtl_load(vd->vdev_child[c]);
+		if (error != 0)
+			break;
+	}
 
-	mutex_enter(&vd->vdev_dtl_lock);
-	error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
-	    NULL, SM_ALLOC, smo, mos);
-	mutex_exit(&vd->vdev_dtl_lock);
-
 	return (error);
 }
 
@@ -1805,66 +2089,75 @@
 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
-	space_map_obj_t *smo = &vd->vdev_dtl_smo;
-	space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
+	range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
 	objset_t *mos = spa->spa_meta_objset;
-	space_map_t smsync;
-	kmutex_t smlock;
-	dmu_buf_t *db;
+	range_tree_t *rtsync;
+	kmutex_t rtlock;
 	dmu_tx_t *tx;
+	uint64_t object = space_map_object(vd->vdev_dtl_sm);
 
 	ASSERT(!vd->vdev_ishole);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
-	if (vd->vdev_detached) {
-		if (smo->smo_object != 0) {
-			int err = dmu_object_free(mos, smo->smo_object, tx);
-			ASSERT0(err);
-			smo->smo_object = 0;
-		}
+	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
+		mutex_enter(&vd->vdev_dtl_lock);
+		space_map_free(vd->vdev_dtl_sm, tx);
+		space_map_close(vd->vdev_dtl_sm);
+		vd->vdev_dtl_sm = NULL;
+		mutex_exit(&vd->vdev_dtl_lock);
 		dmu_tx_commit(tx);
 		return;
 	}
 
-	if (smo->smo_object == 0) {
-		ASSERT(smo->smo_objsize == 0);
-		ASSERT(smo->smo_alloc == 0);
-		smo->smo_object = dmu_object_alloc(mos,
-		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
-		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
-		ASSERT(smo->smo_object != 0);
-		vdev_config_dirty(vd->vdev_top);
+	if (vd->vdev_dtl_sm == NULL) {
+		uint64_t new_object;
+
+		new_object = space_map_alloc(mos, tx);
+		VERIFY3U(new_object, !=, 0);
+
+		VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
+		    0, -1ULL, 0, &vd->vdev_dtl_lock));
+		ASSERT(vd->vdev_dtl_sm != NULL);
 	}
 
-	bzero(&smlock, sizeof (smlock));
-	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
+	bzero(&rtlock, sizeof(rtlock));
+	mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);
 
-	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
-	    &smlock);
+	rtsync = range_tree_create(NULL, NULL, &rtlock);
 
-	mutex_enter(&smlock);
+	mutex_enter(&rtlock);
 
 	mutex_enter(&vd->vdev_dtl_lock);
-	space_map_walk(sm, space_map_add, &smsync);
+	range_tree_walk(rt, range_tree_add, rtsync);
 	mutex_exit(&vd->vdev_dtl_lock);
 
-	space_map_truncate(smo, mos, tx);
-	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
-	space_map_vacate(&smsync, NULL, NULL);
+	space_map_truncate(vd->vdev_dtl_sm, tx);
+	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
+	range_tree_vacate(rtsync, NULL, NULL);
 
-	space_map_destroy(&smsync);
+	range_tree_destroy(rtsync);
 
-	mutex_exit(&smlock);
-	mutex_destroy(&smlock);
+	mutex_exit(&rtlock);
+	mutex_destroy(&rtlock);
 
-	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
-	dmu_buf_will_dirty(db, tx);
-	ASSERT3U(db->db_size, >=, sizeof (*smo));
-	bcopy(smo, db->db_data, sizeof (*smo));
-	dmu_buf_rele(db, FTAG);
+	/*
+	 * If the object for the space map has changed then dirty
+	 * the top level so that we update the config.
+	 */
+	if (object != space_map_object(vd->vdev_dtl_sm)) {
+		zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
+		    "new object %llu", txg, spa_name(spa), object,
+		    space_map_object(vd->vdev_dtl_sm));
+		vdev_config_dirty(vd->vdev_top);
+	}
 
 	dmu_tx_commit(tx);
+
+	mutex_enter(&vd->vdev_dtl_lock);
+	space_map_update(vd->vdev_dtl_sm);
+	mutex_exit(&vd->vdev_dtl_lock);
 }
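
The rewritten vdev_dtl_sync() keeps a useful pattern: copy the in-core tree
into rtsync while holding vdev_dtl_lock, then do the slow space-map
truncate and rewrite from the copy with the lock dropped.  A sketch of that
shape with toy state (a pthread mutex standing in for the kernel mutex and
printf standing in for space_map_write()):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t dtl_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t dtl_missing[4] = { 100, 101, 102, 0 };	/* in-core DTL */

static void
dtl_sync(void)
{
	uint64_t snapshot[4];

	/* The "rtsync" copy: short critical section, no I/O under lock. */
	pthread_mutex_lock(&dtl_lock);
	memcpy(snapshot, dtl_missing, sizeof (snapshot));
	pthread_mutex_unlock(&dtl_lock);

	/* "space_map_truncate + space_map_write": slow part, lock dropped. */
	for (int i = 0; i < 4 && snapshot[i] != 0; i++)
		printf("persisting missed txg %llu\n",
		    (unsigned long long)snapshot[i]);
}

int
main(void)
{
	dtl_sync();
	return (0);
}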
 
 /*
@@ -1913,14 +2206,11 @@
 
 	if (vd->vdev_children == 0) {
 		mutex_enter(&vd->vdev_dtl_lock);
-		if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
+		if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
 		    vdev_writeable(vd)) {
-			space_seg_t *ss;
 
-			ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
-			thismin = ss->ss_start - 1;
-			ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
-			thismax = ss->ss_end;
+			thismin = vdev_dtl_min(vd);
+			thismax = vdev_dtl_max(vd);
 			needed = B_TRUE;
 		}
 		mutex_exit(&vd->vdev_dtl_lock);
@@ -2021,29 +2311,45 @@
 
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
-	if (vd->vdev_dtl_smo.smo_object) {
-		ASSERT0(vd->vdev_dtl_smo.smo_alloc);
-		(void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
-		vd->vdev_dtl_smo.smo_object = 0;
-	}
+	if (vd->vdev_ms != NULL) {
+		metaslab_group_t *mg = vd->vdev_mg;
 
-	if (vd->vdev_ms != NULL) {
+		metaslab_group_histogram_verify(mg);
+		metaslab_class_histogram_verify(mg->mg_class);
+
 		for (int m = 0; m < vd->vdev_ms_count; m++) {
 			metaslab_t *msp = vd->vdev_ms[m];
 
-			if (msp == NULL || msp->ms_smo.smo_object == 0)
+			if (msp == NULL || msp->ms_sm == NULL)
 				continue;
 
-			ASSERT0(msp->ms_smo.smo_alloc);
-			(void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
-			msp->ms_smo.smo_object = 0;
+			mutex_enter(&msp->ms_lock);
+			/*
+			 * If the metaslab was not loaded when the vdev
+			 * was removed then the histogram accounting may
+			 * not be accurate. Update the histogram information
+			 * here so that we ensure that the metaslab group
+			 * and metaslab class are up-to-date.
+			 */
+			metaslab_group_histogram_remove(mg, msp);
+
+			VERIFY0(space_map_allocated(msp->ms_sm));
+			space_map_free(msp->ms_sm, tx);
+			space_map_close(msp->ms_sm);
+			msp->ms_sm = NULL;
+			mutex_exit(&msp->ms_lock);
 		}
+
+		metaslab_group_histogram_verify(mg);
+		metaslab_class_histogram_verify(mg->mg_class);
+		for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+			ASSERT0(mg->mg_histogram[i]);
+
 	}
 
 	if (vd->vdev_ms_array) {
 		(void) dmu_object_free(mos, vd->vdev_ms_array, tx);
 		vd->vdev_ms_array = 0;
-		vd->vdev_ms_shift = 0;
 	}
 	dmu_tx_commit(tx);
 }
@@ -2205,6 +2511,7 @@
 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 {
 	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
+	boolean_t postevent = B_FALSE;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
@@ -2214,6 +2521,10 @@
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
+	postevent =
+	    (vd->vdev_offline == B_TRUE || vd->vdev_tmpoffline == B_TRUE) ?
+	    B_TRUE : B_FALSE;
+
 	tvd = vd->vdev_top;
 	vd->vdev_offline = B_FALSE;
 	vd->vdev_tmpoffline = B_FALSE;
@@ -2249,6 +2560,10 @@
 			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
+
+	if (postevent)
+		spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE);
+
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
@@ -2380,6 +2695,14 @@
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_clear(spa, vd->vdev_child[c]);
 
+	if (vd == rvd) {
+		for (int c = 0; c < spa->spa_l2cache.sav_count; c++)
+			vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]);
+
+		for (int c = 0; c < spa->spa_spares.sav_count; c++)
+			vdev_clear(spa, spa->spa_spares.sav_vdevs[c]);
+	}
+
 	/*
 	 * If we're in the FAULTED state or have experienced failed I/O, then
 	 * clear the persistent state and attempt to reopen the device.  We
@@ -2464,7 +2787,8 @@
 	 * we're asking two separate questions about it.
 	 */
 	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
-	    !vd->vdev_cant_write && !vd->vdev_ishole);
+	    !vd->vdev_cant_write && !vd->vdev_ishole &&
+	    vd->vdev_mg->mg_initialized);
 }
 
 boolean_t
@@ -2490,8 +2814,12 @@
 void
 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
 {
-	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+	spa_t *spa = vd->vdev_spa;
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *tvd = vd->vdev_top;
 
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
 	mutex_enter(&vd->vdev_stat_lock);
 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
@@ -2499,8 +2827,22 @@
 	vs->vs_rsize = vdev_get_min_asize(vd);
 	if (vd->vdev_ops->vdev_op_leaf)
 		vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
-	vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
-	mutex_exit(&vd->vdev_stat_lock);
+	/*
+	 * Report expandable space on top-level, non-auxiliary devices only.
+	 * The expandable space is reported in terms of metaslab sized units
+	 * since that determines how much space the pool can expand.
+	 */
+	if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) {
+		vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize,
+		    1ULL << tvd->vdev_ms_shift);
+	}
+	vs->vs_configured_ashift = vd->vdev_top != NULL
+	    ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
+	vs->vs_logical_ashift = vd->vdev_logical_ashift;
+	vs->vs_physical_ashift = vd->vdev_physical_ashift;
+	if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) {
+		vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
+	}
 
 	/*
 	 * If we're getting stats on the root vdev, aggregate the I/O counts
@@ -2511,15 +2853,14 @@
 			vdev_t *cvd = rvd->vdev_child[c];
 			vdev_stat_t *cvs = &cvd->vdev_stat;
 
-			mutex_enter(&vd->vdev_stat_lock);
 			for (int t = 0; t < ZIO_TYPES; t++) {
 				vs->vs_ops[t] += cvs->vs_ops[t];
 				vs->vs_bytes[t] += cvs->vs_bytes[t];
 			}
 			cvs->vs_scan_removing = cvd->vdev_removing;
-			mutex_exit(&vd->vdev_stat_lock);
 		}
 	}
+	mutex_exit(&vd->vdev_stat_lock);
 }
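
The expandable-space change above rounds vs_esize down to whole metaslabs
with P2ALIGN(), since the pool can only grow by full metaslabs.  A worked
example, assuming 8 GiB metaslabs (ms_shift = 33): a device that grew by
10 GiB reports only 8 GiB expandable:

#include <stdint.h>
#include <stdio.h>

/* P2ALIGN(x, align): round x down to a power-of-two multiple of align. */
#define	P2ALIGN(x, align)	((x) & -(align))

int
main(void)
{
	uint64_t asize = 100ULL << 30;		/* 100 GiB in use */
	uint64_t max_asize = 110ULL << 30;	/* device grew to 110 GiB */
	uint64_t ms_size = 1ULL << 33;		/* 8 GiB metaslabs */

	/* Only whole metaslabs count: 10 GiB aligns down to 8 GiB. */
	printf("vs_esize = %llu\n",
	    (unsigned long long)P2ALIGN(max_asize - asize, ms_size));
	return (0);
}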
 
 void
@@ -3077,7 +3418,7 @@
 boolean_t
 vdev_is_bootable(vdev_t *vd)
 {
-#ifdef sun
+#ifdef illumos
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		char *vdev_type = vd->vdev_ops->vdev_op_type;
 
@@ -3088,8 +3429,6 @@
 		    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
 			return (B_FALSE);
 		}
-	} else if (vd->vdev_wholedisk == 1) {
-		return (B_FALSE);
 	}
 
 	for (int c = 0; c < vd->vdev_children; c++) {
@@ -3096,7 +3435,7 @@
 		if (!vdev_is_bootable(vd->vdev_child[c]))
 			return (B_FALSE);
 	}
-#endif	/* sun */
+#endif	/* illumos */
 	return (B_TRUE);
 }
 
@@ -3195,7 +3534,7 @@
 		vdev_queue_t *vq = &vd->vdev_queue;
 
 		mutex_enter(&vq->vq_lock);
-		if (avl_numnodes(&vq->vq_pending_tree) > 0) {
+		if (avl_numnodes(&vq->vq_active_tree) > 0) {
 			spa_t *spa = vd->vdev_spa;
 			zio_t *fio;
 			uint64_t delta;
@@ -3205,7 +3544,7 @@
 			 * if any I/O has been outstanding for longer than
 			 * the spa_deadman_synctime we panic the system.
 			 */
-			fio = avl_first(&vq->vq_pending_tree);
+			fio = avl_first(&vq->vq_active_tree);
 			delta = gethrtime() - fio->io_timestamp;
 			if (delta > spa_deadman_synctime(spa)) {
 				zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
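
The deadman rename (vq_pending_tree to vq_active_tree) does not change the
check itself: the oldest entry in the active tree bounds every outstanding
I/O, so a single comparison suffices.  A sketch with made-up numbers
(gethrtime() modeled as a plain nanosecond counter; 1000 s is the default
spa_deadman_synctime on FreeBSD of this era):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t now = 2000ULL * 1000000000;		/* "gethrtime()" */
	uint64_t io_timestamp = 980ULL * 1000000000;	/* oldest active I/O */
	uint64_t deadman_synctime = 1000ULL * 1000000000;

	uint64_t delta = now - io_timestamp;
	if (delta > deadman_synctime)
		printf("SLOW IO: would fire deadman, delta %llu ns\n",
		    (unsigned long long)delta);
	return (0);
}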

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -24,7 +24,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -115,7 +115,7 @@
 	{ "misses",		KSTAT_DATA_UINT64 }
 };
 
-#define	VDCSTAT_BUMP(stat)	atomic_add_64(&vdc_stats.stat.value.ui64, 1);
+#define	VDCSTAT_BUMP(stat)	atomic_inc_64(&vdc_stats.stat.value.ui64);
 
 static int
 vdev_cache_offset_compare(const void *a1, const void *a2)
@@ -251,7 +251,8 @@
 	 * any reads that were queued up before the missed update are still
 	 * valid, so we can satisfy them from this line before we evict it.
 	 */
-	while ((pio = zio_walk_parents(fio)) != NULL)
+	zio_link_t *zl = NULL;
+	while ((pio = zio_walk_parents(fio, &zl)) != NULL)
 		vdev_cache_hit(vc, ve, pio);
 
 	if (fio->io_error || ve->ve_missed_update)
@@ -261,9 +262,9 @@
 }
 
 /*
- * Read data from the cache.  Returns 0 on cache hit, errno on a miss.
+ * Read data from the cache.  Returns B_TRUE on a cache hit, B_FALSE on a miss.
  */
-int
+boolean_t
 vdev_cache_read(zio_t *zio)
 {
 	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
@@ -275,16 +276,16 @@
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
 
 	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
-		return (SET_ERROR(EINVAL));
+		return (B_FALSE);
 
 	if (zio->io_size > zfs_vdev_cache_max)
-		return (SET_ERROR(EOVERFLOW));
+		return (B_FALSE);
 
 	/*
 	 * If the I/O straddles two or more cache blocks, don't cache it.
 	 */
 	if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
-		return (SET_ERROR(EXDEV));
+		return (B_FALSE);
 
 	ASSERT(cache_phase + zio->io_size <= VCBS);
 
@@ -296,7 +297,7 @@
 	if (ve != NULL) {
 		if (ve->ve_missed_update) {
 			mutex_exit(&vc->vc_lock);
-			return (SET_ERROR(ESTALE));
+			return (B_FALSE);
 		}
 
 		if ((fio = ve->ve_fill_io) != NULL) {
@@ -304,7 +305,7 @@
 			zio_add_child(zio, fio);
 			mutex_exit(&vc->vc_lock);
 			VDCSTAT_BUMP(vdc_stat_delegations);
-			return (0);
+			return (B_TRUE);
 		}
 
 		vdev_cache_hit(vc, ve, zio);
@@ -312,7 +313,7 @@
 
 		mutex_exit(&vc->vc_lock);
 		VDCSTAT_BUMP(vdc_stat_hits);
-		return (0);
+		return (B_TRUE);
 	}
 
 	ve = vdev_cache_allocate(zio);
@@ -319,11 +320,11 @@
 
 	if (ve == NULL) {
 		mutex_exit(&vc->vc_lock);
-		return (SET_ERROR(ENOMEM));
+		return (B_FALSE);
 	}
 
 	fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
-	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
+	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
 	    ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
 
 	ve->ve_fill_io = fio;
@@ -334,7 +335,7 @@
 	zio_nowait(fio);
 	VDCSTAT_BUMP(vdc_stat_misses);
 
-	return (0);
+	return (B_TRUE);
 }
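
With vdev_cache_read() now returning boolean_t, B_TRUE means the cache path
will complete the zio (a hit, a delegation to an in-flight fill, or a newly
issued fill) and the caller no longer has to interpret errno values.  A
sketch of the simplified caller pattern (stubbed cache; the real caller
lives in zio.c, outside this excerpt):

#include <stdbool.h>
#include <stdio.h>

typedef bool boolean_t;
#define	B_TRUE	true
#define	B_FALSE	false

typedef struct zio { int io_size; } zio_t;

/* Stub with the new contract: B_TRUE = cache will complete this zio. */
static boolean_t
vdev_cache_read(zio_t *zio)
{
	return (zio->io_size <= 16384 ? B_TRUE : B_FALSE);
}

static void
vdev_io_start(zio_t *zio)
{
	if (vdev_cache_read(zio)) {
		printf("served from vdev cache\n");
		return;		/* no device I/O issued */
	}
	printf("issuing device I/O\n");
}

int
main(void)
{
	zio_t small = { 4096 }, big = { 1 << 20 };

	vdev_io_start(&small);
	vdev_io_start(&big);
	return (0);
}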
 
 /*

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,9 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2013 Joyent, Inc.  All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -41,12 +43,147 @@
 
 extern ldi_ident_t zfs_li;
 
-typedef struct vdev_disk_buf {
-	buf_t	vdb_buf;
-	zio_t	*vdb_io;
-} vdev_disk_buf_t;
+static void vdev_disk_close(vdev_t *);
 
+typedef struct vdev_disk_ldi_cb {
+	list_node_t		lcb_next;
+	ldi_callback_id_t	lcb_id;
+} vdev_disk_ldi_cb_t;
+
 static void
+vdev_disk_alloc(vdev_t *vd)
+{
+	vdev_disk_t *dvd;
+
+	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+	/*
+	 * Create the LDI event callback list.
+	 */
+	list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
+	    offsetof(vdev_disk_ldi_cb_t, lcb_next));
+}
+
+static void
+vdev_disk_free(vdev_t *vd)
+{
+	vdev_disk_t *dvd = vd->vdev_tsd;
+	vdev_disk_ldi_cb_t *lcb;
+
+	if (dvd == NULL)
+		return;
+
+	/*
+	 * We have already closed the LDI handle. Clean up the LDI event
+	 * callbacks and free vd->vdev_tsd.
+	 */
+	while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
+		list_remove(&dvd->vd_ldi_cbs, lcb);
+		(void) ldi_ev_remove_callbacks(lcb->lcb_id);
+		kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
+	}
+	list_destroy(&dvd->vd_ldi_cbs);
+	kmem_free(dvd, sizeof (vdev_disk_t));
+	vd->vdev_tsd = NULL;
+}
+
+/* ARGSUSED */
+static int
+vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg,
+    void *ev_data)
+{
+	vdev_t *vd = (vdev_t *)arg;
+	vdev_disk_t *dvd = vd->vdev_tsd;
+
+	/*
+	 * Ignore events other than offline.
+	 */
+	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
+		return (LDI_EV_SUCCESS);
+
+	/*
+	 * All LDI handles must be closed for the state change to succeed, so
+	 * call on vdev_disk_close() to do this.
+	 *
+	 * We inform vdev_disk_close that it is being called from offline
+	 * notify context so it will defer cleanup of LDI event callbacks and
+	 * freeing of vd->vdev_tsd to the offline finalize or a reopen.
+	 */
+	dvd->vd_ldi_offline = B_TRUE;
+	vdev_disk_close(vd);
+
+	/*
+	 * Now that the device is closed, request that the spa_async_thread
+	 * mark the device as REMOVED and notify FMA of the removal.
+	 */
+	zfs_post_remove(vd->vdev_spa, vd);
+	vd->vdev_remove_wanted = B_TRUE;
+	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
+
+	return (LDI_EV_SUCCESS);
+}
+
+/* ARGSUSED */
+static void
+vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
+    int ldi_result, void *arg, void *ev_data)
+{
+	vdev_t *vd = (vdev_t *)arg;
+
+	/*
+	 * Ignore events other than offline.
+	 */
+	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
+		return;
+
+	/*
+	 * We have already closed the LDI handle in notify.
+	 * Clean up the LDI event callbacks and free vd->vdev_tsd.
+	 */
+	vdev_disk_free(vd);
+
+	/*
+	 * Request that the vdev be reopened if the offline state change was
+	 * unsuccessful.
+	 */
+	if (ldi_result != LDI_EV_SUCCESS) {
+		vd->vdev_probe_wanted = B_TRUE;
+		spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
+	}
+}
+
+static ldi_ev_callback_t vdev_disk_off_callb = {
+	.cb_vers = LDI_EV_CB_VERS,
+	.cb_notify = vdev_disk_off_notify,
+	.cb_finalize = vdev_disk_off_finalize
+};
+
+/* ARGSUSED */
+static void
+vdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
+    int ldi_result, void *arg, void *ev_data)
+{
+	vdev_t *vd = (vdev_t *)arg;
+
+	/*
+	 * Ignore events other than degrade.
+	 */
+	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0)
+		return;
+
+	/*
+	 * Degrade events always succeed. Mark the vdev as degraded.
+	 * This status is purely informative for the user.
+	 */
+	(void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0);
+}
+
+static ldi_ev_callback_t vdev_disk_dgrd_callb = {
+	.cb_vers = LDI_EV_CB_VERS,
+	.cb_notify = NULL,
+	.cb_finalize = vdev_disk_dgrd_finalize
+};
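
The offline support above follows LDI's two-phase protocol: the notify
callback must close the handle up front so the state change can proceed,
and the finalize callback either frees the per-vdev state (on success) or
requests a probe/reopen (on failure).  A rough model of that sequencing
(hypothetical names, synchronous calls; not the real ldi_ev_* API):

#include <stdbool.h>
#include <stdio.h>

static bool handle_open = true;

static int
off_notify(void)
{
	/* Phase 1: drop our handle so the offline can proceed. */
	handle_open = false;
	printf("notify: handle closed, removal requested\n");
	return (0);			/* "LDI_EV_SUCCESS" */
}

static void
off_finalize(int result)
{
	if (result == 0) {
		printf("finalize: offline succeeded, free per-vdev state\n");
	} else {
		/* Offline was vetoed elsewhere: ask for a reopen. */
		printf("finalize: offline failed, request probe/reopen\n");
	}
}

int
main(void)
{
	(void) off_notify();
	off_finalize(0);
	return (0);
}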
+
+static void
 vdev_disk_hold(vdev_t *vd)
 {
 	ddi_devid_t devid;
@@ -105,48 +242,36 @@
 	}
 }
 
-static uint64_t
-vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz)
-{
-	ASSERT(vd->vdev_wholedisk);
+/*
+ * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when
+ * even a fallback to DKIOCGMEDIAINFO fails.
+ */
+#ifdef DEBUG
+#define	VDEV_DEBUG(...)	cmn_err(CE_NOTE, __VA_ARGS__)
+#else
+#define	VDEV_DEBUG(...)	/* Nothing... */
+#endif
 
-	vdev_disk_t *dvd = vd->vdev_tsd;
-	dk_efi_t dk_ioc;
-	efi_gpt_t *efi;
-	uint64_t avail_space = 0;
-	int efisize = EFI_LABEL_SIZE * 2;
-
-	dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP);
-	dk_ioc.dki_lba = 1;
-	dk_ioc.dki_length = efisize;
-	dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data;
-	efi = dk_ioc.dki_data;
-
-	if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc,
-	    FKIOCTL, kcred, NULL) == 0) {
-		uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA);
-
-		zfs_dbgmsg("vdev %s, capacity %llu, altern lba %llu",
-		    vd->vdev_path, capacity, efi_altern_lba);
-		if (capacity > efi_altern_lba)
-			avail_space = (capacity - efi_altern_lba) * blksz;
-	}
-	kmem_free(dk_ioc.dki_data, efisize);
-	return (avail_space);
-}
-
 static int
 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
     uint64_t *ashift)
 {
 	spa_t *spa = vd->vdev_spa;
-	vdev_disk_t *dvd;
-	struct dk_minfo_ext dkmext;
+	vdev_disk_t *dvd = vd->vdev_tsd;
+	ldi_ev_cookie_t ecookie;
+	vdev_disk_ldi_cb_t *lcb;
+	union {
+		struct dk_minfo_ext ude;
+		struct dk_minfo ud;
+	} dks;
+	struct dk_minfo_ext *dkmext = &dks.ude;
+	struct dk_minfo *dkm = &dks.ud;
 	int error;
 	dev_t dev;
 	int otyp;
 	boolean_t validate_devid = B_FALSE;
 	ddi_devid_t devid;
+	uint64_t capacity = 0, blksz = 0, pbsize;
 
 	/*
 	 * We must have a pathname, and it must be absolute.
@@ -160,13 +285,25 @@
 	 * Reopen the device if it's not currently open. Otherwise,
 	 * just update the physical size of the device.
 	 */
-	if (vd->vdev_tsd != NULL) {
-		ASSERT(vd->vdev_reopening);
-		dvd = vd->vdev_tsd;
-		goto skip_open;
+	if (dvd != NULL) {
+		if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) {
+			/*
+			 * If we are opening a device in its offline notify
+			 * context, the LDI handle was just closed. Clean
+			 * up the LDI event callbacks and free vd->vdev_tsd.
+			 */
+			vdev_disk_free(vd);
+		} else {
+			ASSERT(vd->vdev_reopening);
+			goto skip_open;
+		}
 	}
 
-	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+	/*
+	 * Create vd->vdev_tsd.
+	 */
+	vdev_disk_alloc(vd);
+	dvd = vd->vdev_tsd;
 
 	/*
 	 * When opening a disk device, we want to preserve the user's original
@@ -199,23 +336,28 @@
 		if (vd->vdev_wholedisk == -1ULL) {
 			size_t len = strlen(vd->vdev_path) + 3;
 			char *buf = kmem_alloc(len, KM_SLEEP);
-			ldi_handle_t lh;
 
 			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
 
-			if (ldi_open_by_name(buf, spa_mode(spa), kcred,
-			    &lh, zfs_li) == 0) {
+			error = ldi_open_by_name(buf, spa_mode(spa), kcred,
+			    &dvd->vd_lh, zfs_li);
+			if (error == 0) {
 				spa_strfree(vd->vdev_path);
 				vd->vdev_path = buf;
 				vd->vdev_wholedisk = 1ULL;
-				(void) ldi_close(lh, spa_mode(spa), kcred);
 			} else {
 				kmem_free(buf, len);
 			}
 		}
 
-		error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred,
-		    &dvd->vd_lh, zfs_li);
+		/*
+		 * If we have not yet opened the device, try to open it by the
+		 * specified path.
+		 */
+		if (error != 0) {
+			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
+			    kcred, &dvd->vd_lh, zfs_li);
+		}
 
 		/*
 		 * Compare the devid to the stored value.
@@ -322,6 +464,27 @@
 		kmem_free(physpath, MAXPATHLEN);
 	}
 
+	/*
+	 * Register callbacks for the LDI offline event.
+	 */
+	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
+	    LDI_EV_SUCCESS) {
+		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
+		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
+		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
+		    &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
+	}
+
+	/*
+	 * Register callbacks for the LDI degrade event.
+	 */
+	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
+	    LDI_EV_SUCCESS) {
+		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
+		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
+		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
+		    &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
+	}
 skip_open:
 	/*
 	 * Determine the actual size of the device.
@@ -331,33 +494,53 @@
 		return (SET_ERROR(EINVAL));
 	}
 
+	*max_psize = *psize;
+
 	/*
 	 * Determine the device's minimum transfer size.
 	 * If the ioctl isn't supported, assume DEV_BSIZE.
 	 */
-	if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)&dkmext,
-	    FKIOCTL, kcred, NULL) != 0)
-		dkmext.dki_pbsize = DEV_BSIZE;
+	if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
+	    (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
+		capacity = dkmext->dki_capacity - 1;
+		blksz = dkmext->dki_lbsize;
+		pbsize = dkmext->dki_pbsize;
+	} else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
+	    (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
+		VDEV_DEBUG(
+		    "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
+		    vd->vdev_path);
+		capacity = dkm->dki_capacity - 1;
+		blksz = dkm->dki_lbsize;
+		pbsize = blksz;
+	} else {
+		VDEV_DEBUG("vdev_disk_open(\"%s\"): "
+		    "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
+		    vd->vdev_path, error);
+		pbsize = DEV_BSIZE;
+	}
 
-	*ashift = highbit(MAX(dkmext.dki_pbsize, SPA_MINBLOCKSIZE)) - 1;
+	*ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;
 
 	if (vd->vdev_wholedisk == 1) {
-		uint64_t capacity = dkmext.dki_capacity - 1;
-		uint64_t blksz = dkmext.dki_lbsize;
 		int wce = 1;
 
+		if (error == 0) {
+			/*
+			 * If we have the capability to expand, we'd have
+			 * found out via success from DKIOCGMEDIAINFO{,EXT}.
+			 * Adjust max_psize upward accordingly since we know
+			 * we own the whole disk now.
+			 */
+			*max_psize = capacity * blksz;
+		}
+
 		/*
-		 * If we own the whole disk, try to enable disk write caching.
-		 * We ignore errors because it's OK if we can't do it.
+		 * Since we own the whole disk, try to enable disk write
+		 * caching.  We ignore errors because it's OK if we can't do it.
 		 */
 		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
 		    FKIOCTL, kcred, NULL);
-
-		*max_psize = *psize + vdev_disk_get_space(vd, capacity, blksz);
-		zfs_dbgmsg("capacity change: vdev %s, psize %llu, "
-		    "max_psize %llu", vd->vdev_path, *psize, *max_psize);
-	} else {
-		*max_psize = *psize;
 	}
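
The open path now tries DKIOCGMEDIAINFOEXT, falls back to DKIOCGMEDIAINFO,
and finally assumes DEV_BSIZE; whatever physical block size survives becomes
the ashift via highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1.  Two worked
cases (highbit64() modeled locally): 512 B gives ashift 9, 4096 B gives
ashift 12:

#include <stdint.h>
#include <stdio.h>

/* Model of highbit64(): 1-indexed position of the highest set bit. */
static int
highbit64(uint64_t i)
{
	int h = 0;

	while (i != 0) {
		h++;
		i >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t spa_minblocksize = 512;
	/* pbsize as reported by the ioctls, or the DEV_BSIZE fallback */
	uint64_t pbsizes[] = { 512, 4096 };

	for (int i = 0; i < 2; i++) {
		uint64_t pb = pbsizes[i];

		if (pb < spa_minblocksize)
			pb = spa_minblocksize;
		printf("pbsize %llu -> ashift %d\n",
		    (unsigned long long)pbsizes[i], highbit64(pb) - 1);
	}
	return (0);
}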
 
 	/*
@@ -377,24 +560,65 @@
 	if (vd->vdev_reopening || dvd == NULL)
 		return;
 
-	if (dvd->vd_minor != NULL)
+	if (dvd->vd_minor != NULL) {
 		ddi_devid_str_free(dvd->vd_minor);
+		dvd->vd_minor = NULL;
+	}
 
-	if (dvd->vd_devid != NULL)
+	if (dvd->vd_devid != NULL) {
 		ddi_devid_free(dvd->vd_devid);
+		dvd->vd_devid = NULL;
+	}
 
-	if (dvd->vd_lh != NULL)
+	if (dvd->vd_lh != NULL) {
 		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
+		dvd->vd_lh = NULL;
+	}
 
 	vd->vdev_delayed_close = B_FALSE;
-	kmem_free(dvd, sizeof (vdev_disk_t));
-	vd->vdev_tsd = NULL;
+	/*
+	 * If we closed the LDI handle due to an offline notify from LDI,
+	 * don't free vd->vdev_tsd or unregister the callbacks here;
+	 * the offline finalize callback or a reopen will take care of it.
+	 */
+	if (dvd->vd_ldi_offline)
+		return;
+
+	vdev_disk_free(vd);
 }
 
 int
-vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size,
-    uint64_t offset, int flags)
+vdev_disk_physio(vdev_t *vd, caddr_t data,
+    size_t size, uint64_t offset, int flags, boolean_t isdump)
 {
+	vdev_disk_t *dvd = vd->vdev_tsd;
+
+	/*
+	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+	 * Nothing to be done here but return failure.
+	 */
+	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
+		return (EIO);
+
+	ASSERT(vd->vdev_ops == &vdev_disk_ops);
+
+	/*
+	 * If in the context of an active crash dump, use the ldi_dump(9F)
+	 * call instead of ldi_strategy(9F) as usual.
+	 */
+	if (isdump) {
+		ASSERT3P(dvd, !=, NULL);
+		return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
+		    lbtodb(size)));
+	}
+
+	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
+}
+
+int
+vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
+    size_t size, uint64_t offset, int flags)
+{
 	buf_t *bp;
 	int error = 0;
 
@@ -422,8 +646,8 @@
 static void
 vdev_disk_io_intr(buf_t *bp)
 {
-	vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
-	zio_t *zio = vdb->vdb_io;
+	vdev_buf_t *vb = (vdev_buf_t *)bp;
+	zio_t *zio = vb->vb_io;
 
 	/*
 	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
@@ -430,14 +654,14 @@
 	 * Rather than teach the rest of the stack about other error
 	 * possibilities (EFAULT, etc), we normalize the error value here.
 	 */
-	zio->io_error = (geterror(bp) != 0 ? EIO : 0);
+	zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);
 
 	if (zio->io_error == 0 && bp->b_resid != 0)
 		zio->io_error = SET_ERROR(EIO);
 
-	kmem_free(vdb, sizeof (vdev_disk_buf_t));
+	kmem_free(vb, sizeof (vdev_buf_t));
 
-	zio_interrupt(zio);
+	zio_delay_interrupt(zio);
 }
 
 static void
@@ -461,21 +685,32 @@
 	zio_interrupt(zio);
 }
 
-static int
+static void
 vdev_disk_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_disk_t *dvd = vd->vdev_tsd;
-	vdev_disk_buf_t *vdb;
+	vdev_buf_t *vb;
 	struct dk_callback *dkc;
 	buf_t *bp;
 	int error;
 
+	/*
+	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+	 * Nothing to be done here but return failure.
+	 */
+	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
+		zio->io_error = SET_ERROR(ENXIO);
+		zio_interrupt(zio);
+		return;
+	}
+
 	if (zio->io_type == ZIO_TYPE_IOCTL) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
-			return (ZIO_PIPELINE_CONTINUE);
+			zio_interrupt(zio);
+			return;
 		}
 
 		switch (zio->io_cmd) {
@@ -506,7 +741,7 @@
 				 * and will call vdev_disk_ioctl_done()
 				 * upon completion.
 				 */
-				return (ZIO_PIPELINE_STOP);
+				return;
 			}
 
 			if (error == ENOTSUP || error == ENOTTY) {
@@ -527,14 +762,18 @@
 			zio->io_error = SET_ERROR(ENOTSUP);
 		}
 
-		return (ZIO_PIPELINE_CONTINUE);
+		zio_execute(zio);
+		return;
 	}
 
-	vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);
+	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+	zio->io_target_timestamp = zio_handle_io_delay(zio);
 
-	vdb->vdb_io = zio;
-	bp = &vdb->vdb_buf;
+	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
 
+	vb->vb_io = zio;
+	bp = &vb->vb_buf;
+
 	bioinit(bp);
 	bp->b_flags = B_BUSY | B_NOCACHE |
 	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
@@ -548,8 +787,6 @@
 
 	/* ldi_strategy() will return non-zero only on programming errors */
 	VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
-
-	return (ZIO_PIPELINE_STOP);
 }
 
 static void
@@ -641,7 +878,7 @@
 
 		/* read vdev label */
 		offset = vdev_label_offset(size, l, 0);
-		if (vdev_disk_physio(vd_lh, (caddr_t)label,
+		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
 		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
 			continue;
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -36,6 +36,21 @@
  * Virtual device vector for files.
  */
 
+static taskq_t *vdev_file_taskq;
+
+void
+vdev_file_init(void)
+{
+	vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16),
+	    minclsyspri, max_ncpus, INT_MAX, 0);
+}
+
+void
+vdev_file_fini(void)
+{
+	taskq_destroy(vdev_file_taskq);
+}
+
 static void
 vdev_file_hold(vdev_t *vd)
 {
@@ -50,12 +65,12 @@
 
 static int
 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
-    uint64_t *ashift)
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_file_t *vf;
 	vnode_t *vp;
 	vattr_t vattr;
-	int error, vfslocked;
+	int error;
 
 	/*
 	 * We must have a pathname, and it must be absolute.
@@ -119,11 +134,9 @@
 	 * Determine the physical size of the file.
 	 */
 	vattr.va_mask = AT_SIZE;
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	error = VOP_GETATTR(vp, &vattr, kcred);
 	VOP_UNLOCK(vp, 0);
-	VFS_UNLOCK_GIANT(vfslocked);
 	if (error) {
 		(void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
@@ -132,8 +145,11 @@
 		return (error);
 	}
 
+	vd->vdev_notrim = B_TRUE;
+
 	*max_psize = *psize = vattr.va_size;
-	*ashift = SPA_MINBLOCKSHIFT;
+	*logical_ashift = SPA_MINBLOCKSHIFT;
+	*physical_ashift = SPA_MINBLOCKSHIFT;
 
 	return (0);
 }
@@ -156,26 +172,58 @@
 	vd->vdev_tsd = NULL;
 }
 
-static int
-vdev_file_io_start(zio_t *zio)
+/*
+ * Implements the interrupt side for file vdev types. This routine will be
+ * called when the I/O completes allowing us to transfer the I/O to the
+ * interrupt taskqs. For consistency, the code structure mimics disk vdev
+ * types.
+ */
+static void
+vdev_file_io_intr(zio_t *zio)
 {
+	zio_delay_interrupt(zio);
+}
+
+static void
+vdev_file_io_strategy(void *arg)
+{
+	zio_t *zio = arg;
 	vdev_t *vd = zio->io_vd;
 	vdev_file_t *vf;
 	vnode_t *vp;
 	ssize_t resid;
 
-	if (!vdev_readable(vd)) {
-		zio->io_error = SET_ERROR(ENXIO);
-		return (ZIO_PIPELINE_CONTINUE);
-	}
-
 	vf = vd->vdev_tsd;
 	vp = vf->vf_vnode;
 
+	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
+	    UIO_READ : UIO_WRITE, vp, zio->io_data, zio->io_size,
+	    zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+
+	if (resid != 0 && zio->io_error == 0)
+		zio->io_error = ENOSPC;
+
+	vdev_file_io_intr(zio);
+}
+
+static void
+vdev_file_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_file_t *vf = vd->vdev_tsd;
+
 	if (zio->io_type == ZIO_TYPE_IOCTL) {
+		/* XXPOLICY */
+		if (!vdev_readable(vd)) {
+			zio->io_error = SET_ERROR(ENXIO);
+			zio_interrupt(zio);
+			return;
+		}
+
 		switch (zio->io_cmd) {
 		case DKIOCFLUSHWRITECACHE:
-			zio->io_error = VOP_FSYNC(vp, FSYNC | FDSYNC,
+			zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
 			    kcred, NULL);
 			break;
 		default:
@@ -182,19 +230,15 @@
 			zio->io_error = SET_ERROR(ENOTSUP);
 		}
 
-		return (ZIO_PIPELINE_CONTINUE);
+		zio_execute(zio);
+		return;
 	}
 
-	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
-	    UIO_READ : UIO_WRITE, vp, zio->io_data, zio->io_size,
-	    zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+	zio->io_target_timestamp = zio_handle_io_delay(zio);
 
-	if (resid != 0 && zio->io_error == 0)
-		zio->io_error = ENOSPC;
-
-	zio_interrupt(zio);
-
-	return (ZIO_PIPELINE_STOP);
+	VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
+	    TQ_SLEEP), !=, 0);
 }
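
File vdev I/O is now pushed onto the z_vdev_file taskq rather than issued
inline, presumably so the blocking vn_rdwr() never stalls the thread
driving the zio pipeline; completion re-enters through
zio_delay_interrupt().  A shape-only sketch of the start/strategy split
(taskq_dispatch() replaced by a synchronous stand-in):

#include <stdio.h>

typedef struct zio { const char *io_path; } zio_t;

/* Stand-in for taskq_dispatch(): the kernel runs this asynchronously on
 * the z_vdev_file taskq; here it is called inline just to show the split. */
static void
dispatch(void (*func)(void *), void *arg)
{
	func(arg);
}

/* "Strategy" half: does the blocking vnode read/write. */
static void
file_io_strategy(void *arg)
{
	zio_t *zio = arg;

	printf("vn_rdwr() against %s\n", zio->io_path);
	/* completion would hand back to the interrupt path here */
}

/* "Start" half: never blocks; queues the work and returns immediately. */
static void
file_io_start(zio_t *zio)
{
	dispatch(file_io_strategy, zio);
}

int
main(void)
{
	zio_t zio = { "/pool-file.img" };

	file_io_start(&zio);
	return (0);
}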
 
 /* ARGSUSED */

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -43,11 +43,23 @@
  * Virtual device vector for GEOM.
  */
 
+static g_attrchanged_t vdev_geom_attrchanged;
 struct g_class zfs_vdev_class = {
 	.name = "ZFS::VDEV",
 	.version = G_VERSION,
+	.attrchanged = vdev_geom_attrchanged,
 };
 
+struct consumer_vdev_elem {
+	SLIST_ENTRY(consumer_vdev_elem)	elems;
+	vdev_t				*vd;
+};
+
+SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
+_Static_assert(sizeof(((struct g_consumer*)NULL)->private)
+    == sizeof(struct consumer_priv_t*),
+    "consumer_priv_t* can't be stored in g_consumer.private");
+
 DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
 
 SYSCTL_DECL(_vfs_zfs_vdev);
@@ -62,15 +74,103 @@
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
     &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
 
+/* Declare local functions */
+static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
+
+/*
+ * Thread local storage used to indicate when a thread is probing geoms
+ * for their guids.  If NULL, this thread is not tasting geoms.  If non-NULL,
+ * it is looking for a replacement for the vdev_t* that is its value.
+ */
+uint_t zfs_geom_probe_vdev_key;
+
 static void
+vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
+{
+	int error;
+	uint16_t rate;
+
+	error = g_getattr("GEOM::rotation_rate", cp, &rate);
+	if (error == 0)
+		vd->vdev_rotation_rate = rate;
+	else
+		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
+}
+
+static void
+vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
+		       boolean_t do_null_update)
+{
+	boolean_t needs_update = B_FALSE;
+	char *physpath;
+	int error, physpath_len;
+
+	physpath_len = MAXPATHLEN;
+	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
+	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
+	if (error == 0) {
+		char *old_physpath;
+
+		/* g_topology lock ensures that vdev has not been closed */
+		g_topology_assert();
+		old_physpath = vd->vdev_physpath;
+		vd->vdev_physpath = spa_strdup(physpath);
+
+		if (old_physpath != NULL) {
+			needs_update = (strcmp(old_physpath,
+						vd->vdev_physpath) != 0);
+			spa_strfree(old_physpath);
+		} else
+			needs_update = do_null_update;
+	}
+	g_free(physpath);
+
+	/*
+	 * If the physical path changed, update the config.
+	 * Only request an update for previously unset physpaths if
+	 * requested by the caller.
+	 */
+	if (needs_update)
+		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
+
+}
+
+static void
+vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
+{
+	struct consumer_priv_t *priv;
+	struct consumer_vdev_elem *elem;
+
+	priv = (struct consumer_priv_t*)&cp->private;
+	if (SLIST_EMPTY(priv))
+		return;
+
+	SLIST_FOREACH(elem, priv, elems) {
+		vdev_t *vd = elem->vd;
+		if (strcmp(attr, "GEOM::rotation_rate") == 0) {
+			vdev_geom_set_rotation_rate(vd, cp);
+			return;
+		}
+		if (strcmp(attr, "GEOM::physpath") == 0) {
+			vdev_geom_set_physpath(vd, cp, /*null_update*/B_TRUE);
+			return;
+		}
+	}
+}
+
+static void
 vdev_geom_orphan(struct g_consumer *cp)
 {
-	vdev_t *vd;
+	struct consumer_priv_t *priv;
+	struct consumer_vdev_elem *elem;
 
 	g_topology_assert();
 
-	vd = cp->private;
-	if (vd == NULL)
+	priv = (struct consumer_priv_t*)&cp->private;
+	if (SLIST_EMPTY(priv))
+		/* Vdev close in progress.  Ignore the event. */
 		return;
 
 	/*
@@ -87,20 +187,39 @@
 	 * async removal support to invoke a close on this
 	 * vdev once it is safe to do so.
 	 */
-	zfs_post_remove(vd->vdev_spa, vd);
-	vd->vdev_remove_wanted = B_TRUE;
-	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
+	SLIST_FOREACH(elem, priv, elems) {
+		vdev_t *vd = elem->vd;
+
+		vd->vdev_remove_wanted = B_TRUE;
+		spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
+	}
 }
 
 static struct g_consumer *
-vdev_geom_attach(struct g_provider *pp)
+vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp;
+	int error;
 
 	g_topology_assert();
 
 	ZFS_LOG(1, "Attaching to %s.", pp->name);
+
+	if (sanity) {
+		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
+			ZFS_LOG(1, "Failing attach of %s. "
+				   "Incompatible sectorsize %d\n",
+			    pp->name, pp->sectorsize);
+			return (NULL);
+		} else if (pp->mediasize < SPA_MINDEVSIZE) {
+			ZFS_LOG(1, "Failing attach of %s. "
+				   "Incompatible mediasize %ju\n",
+			    pp->name, pp->mediasize);
+			return (NULL);
+		}
+	}
+
 	/* Do we have geom already? No? Create one. */
 	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
 		if (gp->flags & G_GEOM_WITHER)
@@ -112,13 +231,20 @@
 	if (gp == NULL) {
 		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
 		gp->orphan = vdev_geom_orphan;
+		gp->attrchanged = vdev_geom_attrchanged;
 		cp = g_new_consumer(gp);
-		if (g_attach(cp, pp) != 0) {
-			g_wither_geom(gp, ENXIO);
+		error = g_attach(cp, pp);
+		if (error != 0) {
+			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
+			    __LINE__, error);
+			vdev_geom_detach(cp, B_FALSE);
 			return (NULL);
 		}
-		if (g_access(cp, 1, 0, 1) != 0) {
-			g_wither_geom(gp, ENXIO);
+		error = g_access(cp, 1, 0, 1);
+		if (error != 0) {
+			ZFS_LOG(1, "%s(%d): g_access failed: %d", __func__,
+			       __LINE__, error);
+			vdev_geom_detach(cp, B_FALSE);
 			return (NULL);
 		}
 		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
@@ -132,43 +258,61 @@
 		}
 		if (cp == NULL) {
 			cp = g_new_consumer(gp);
-			if (g_attach(cp, pp) != 0) {
-				g_destroy_consumer(cp);
+			error = g_attach(cp, pp);
+			if (error != 0) {
+				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
+				    __func__, __LINE__, error);
+				vdev_geom_detach(cp, B_FALSE);
 				return (NULL);
 			}
-			if (g_access(cp, 1, 0, 1) != 0) {
-				g_detach(cp);
-				g_destroy_consumer(cp);
+			error = g_access(cp, 1, 0, 1);
+			if (error != 0) {
+				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+				    __func__, __LINE__, error);
+				vdev_geom_detach(cp, B_FALSE);
 				return (NULL);
 			}
 			ZFS_LOG(1, "Created consumer for %s.", pp->name);
 		} else {
-			if (g_access(cp, 1, 0, 1) != 0)
+			error = g_access(cp, 1, 0, 1);
+			if (error != 0) {
+				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+				    __func__, __LINE__, error);
 				return (NULL);
+			}
 			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
 		}
 	}
+
+	if (vd != NULL)
+		vd->vdev_tsd = cp;
+
+	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	return (cp);
 }
 
 static void
-vdev_geom_detach(void *arg, int flag __unused)
+vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
 {
 	struct g_geom *gp;
-	struct g_consumer *cp;
 
 	g_topology_assert();
-	cp = arg;
+
+	ZFS_LOG(1, "Detaching from %s.",
+	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");
+
 	gp = cp->geom;
-
-	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
-	g_access(cp, -1, 0, -1);
+	if (open_for_read)
+		g_access(cp, -1, 0, -1);
 	/* Destroy consumer on last close. */
 	if (cp->acr == 0 && cp->ace == 0) {
-		ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
 		if (cp->acw > 0)
 			g_access(cp, 0, -cp->acw, 0);
-		g_detach(cp);
+		if (cp->provider != NULL) {
+			ZFS_LOG(1, "Destroying consumer for %s.",
+			    cp->provider->name ? cp->provider->name : "NULL");
+			g_detach(cp);
+		}
 		g_destroy_consumer(cp);
 	}
 	/* Destroy geom if there are no consumers left. */
@@ -178,70 +322,115 @@
 	}
 }
 
-static uint64_t
-nvlist_get_guid(nvlist_t *list)
+static void
+vdev_geom_close_locked(vdev_t *vd)
 {
-	uint64_t value;
+	struct g_consumer *cp;
+	struct consumer_priv_t *priv;
+	struct consumer_vdev_elem *elem, *elem_temp;
 
-	value = 0;
-	nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, &value);
-	return (value);
+	g_topology_assert();
+
+	cp = vd->vdev_tsd;
+	vd->vdev_delayed_close = B_FALSE;
+	if (cp == NULL)
+		return;
+
+	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
+	KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
+	priv = (struct consumer_priv_t*)&cp->private;
+	vd->vdev_tsd = NULL;
+	SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
+		if (elem->vd == vd) {
+			SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
+			g_free(elem);
+		}
+	}
+
+	vdev_geom_detach(cp, B_TRUE);
 }
 
-static int
-vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t size)
+/*
+ * Issue one or more bios to the vdev in parallel.
+ * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.  Each IO
+ * operation is described by parallel entries from each array.  There may be
+ * more bios actually issued than entries in the arrays.
+ */
+static void
+vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
+    off_t *sizes, int *errors, int ncmds)
 {
-	struct bio *bp;
+	struct bio **bios;
 	u_char *p;
-	off_t off, maxio;
-	int error;
+	off_t off, maxio, s, end;
+	int i, n_bios, j;
+	size_t bios_size;
 
-	ASSERT((offset % cp->provider->sectorsize) == 0);
-	ASSERT((size % cp->provider->sectorsize) == 0);
-
-	bp = g_alloc_bio();
-	off = offset;
-	offset += size;
-	p = data;
 	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
-	error = 0;
+	n_bios = 0;
 
-	for (; off < offset; off += maxio, p += maxio, size -= maxio) {
-		bzero(bp, sizeof(*bp));
-		bp->bio_cmd = cmd;
-		bp->bio_done = NULL;
-		bp->bio_offset = off;
-		bp->bio_length = MIN(size, maxio);
-		bp->bio_data = p;
-		g_io_request(bp, cp);
-		error = biowait(bp, "vdev_geom_io");
-		if (error != 0)
-			break;
+	/* How many bios are required for all commands ? */
+	for (i = 0; i < ncmds; i++)
+		n_bios += (sizes[i] + maxio - 1) / maxio;
+
+	/* Allocate memory for the bios */
+	bios_size = n_bios * sizeof(struct bio*);
+	bios = kmem_zalloc(bios_size, KM_SLEEP);
+
+	/* Prepare and issue all of the bios */
+	for (i = j = 0; i < ncmds; i++) {
+		off = offsets[i];
+		p = datas[i];
+		s = sizes[i];
+		end = off + s;
+		ASSERT((off % cp->provider->sectorsize) == 0);
+		ASSERT((s % cp->provider->sectorsize) == 0);
+
+		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
+			bios[j] = g_alloc_bio();
+			bios[j]->bio_cmd = cmds[i];
+			bios[j]->bio_done = NULL;
+			bios[j]->bio_offset = off;
+			bios[j]->bio_length = MIN(s, maxio);
+			bios[j]->bio_data = p;
+			g_io_request(bios[j], cp);
+		}
 	}
+	ASSERT(j == n_bios);
 
-	g_destroy_bio(bp);
-	return (error);
-}
+	/* Wait for all of the bios to complete, and clean them up */
+	for (i = j = 0; i < ncmds; i++) {
+		off = offsets[i];
+		s = sizes[i];
+		end = off + s;
 
-static void
-vdev_geom_taste_orphan(struct g_consumer *cp)
-{
-
-	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
-	    cp->provider->name));
+		for (; off < end; off += maxio, s -= maxio, j++) {
+			errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i];
+			g_destroy_bio(bios[j]);
+		}
+	}
+	kmem_free(bios, bios_size);
 }
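
   With the batched interface, a single logical read is just the
   ncmds == 1 case.  A sketch of such a call, assuming the topology lock
   is not held and using the provider's sector size to satisfy the
   alignment asserts (the buffer handling is illustrative only):

	int cmd = BIO_READ, err = 0;
	off_t off = 0;
	off_t len = cp->provider->sectorsize;
	void *buf = kmem_alloc(len, KM_SLEEP);

	vdev_geom_io(cp, &cmd, &buf, &off, &len, &err, 1);
	/* ... consume buf when err == 0 ... */
	kmem_free(buf, len);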
 
+/*
+ * Read the vdev config from a device.  Return the number of valid labels that
+ * were found.  The vdev config will be returned in config if and only if at
+ * least one valid label was found.
+ */
 static int
 vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
 {
 	struct g_provider *pp;
-	vdev_label_t *label;
-	char *p, *buf;
+	vdev_phys_t *vdev_lists[VDEV_LABELS];
+	char *buf;
 	size_t buflen;
-	uint64_t psize;
-	off_t offset, size;
-	uint64_t guid, state, txg;
-	int error, l, len;
+	uint64_t psize, state, txg;
+	off_t offsets[VDEV_LABELS];
+	off_t size;
+	off_t sizes[VDEV_LABELS];
+	int cmds[VDEV_LABELS];
+	int errors[VDEV_LABELS];
+	int l, nlabels;
 
 	g_topology_assert_not();
 
@@ -251,24 +440,34 @@
 	psize = pp->mediasize;
 	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));
 
-	size = sizeof(*label) + pp->sectorsize -
-	    ((sizeof(*label) - 1) % pp->sectorsize) - 1;
+	size = sizeof(*vdev_lists[0]) + pp->sectorsize -
+	    ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
 
-	guid = 0;
-	label = kmem_alloc(size, KM_SLEEP);
-	buflen = sizeof(label->vl_vdev_phys.vp_nvlist);
+	buflen = sizeof(vdev_lists[0]->vp_nvlist);
 
 	*config = NULL;
+	/* Create all of the IO requests */
 	for (l = 0; l < VDEV_LABELS; l++) {
+		cmds[l] = BIO_READ;
+		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
+		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
+		sizes[l] = size;
+		errors[l] = 0;
+		ASSERT(offsets[l] % pp->sectorsize == 0);
+	}
 
-		offset = vdev_label_offset(psize, l, 0);
-		if ((offset % pp->sectorsize) != 0)
-			continue;
+	/* Issue the IO requests */
+	vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
+	    VDEV_LABELS);
 
-		if (vdev_geom_io(cp, BIO_READ, label, offset, size) != 0)
+	/* Parse the labels */
+	nlabels = 0;
+	for (l = 0; l < VDEV_LABELS; l++) {
+		if (errors[l] != 0)
 			continue;
-		buf = label->vl_vdev_phys.vp_nvlist;
 
+		buf = vdev_lists[l]->vp_nvlist;
+
 		if (nvlist_unpack(buf, buflen, config, 0) != 0)
 			continue;
 
@@ -279,7 +478,8 @@
 			continue;
 		}
 
-		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+		if (state != POOL_STATE_SPARE &&
+		    state != POOL_STATE_L2CACHE &&
 		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
 		    &txg) != 0 || txg == 0)) {
 			nvlist_free(*config);
@@ -287,11 +487,14 @@
 			continue;
 		}
 
-		break;
+		nlabels++;
 	}
 
-	kmem_free(label, size);
-	return (*config == NULL ? ENOENT : 0);
+	/* Free the label storage */
+	for (l = 0; l < VDEV_LABELS; l++)
+		kmem_free(vdev_lists[l], size);
+
+	return (nlabels);
 }
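
   Note the changed contract: the function now returns a label count
   rather than an errno, and *config is only meaningful when the count is
   nonzero.  The calling sequence used by the tasting paths below is:

	g_topology_unlock();
	nlabels = vdev_geom_read_config(cp, &config);
	g_topology_lock();
	vdev_geom_detach(cp, B_TRUE);
	if (nlabels == 0)
		return (NO_MATCH);	/* nothing usable on this provider */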
 
 static void
@@ -364,49 +567,21 @@
 	nvlist_free(cfg);
 }
 
-static int
-vdev_geom_attach_taster(struct g_consumer *cp, struct g_provider *pp)
-{
-	int error;
-
-	if (pp->flags & G_PF_WITHER)
-		return (EINVAL);
-	if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize))
-		return (EINVAL);
-	g_attach(cp, pp);
-	error = g_access(cp, 1, 0, 0);
-	if (error != 0)
-		g_detach(cp);
-	return (error);
-}
-
-static void
-vdev_geom_detach_taster(struct g_consumer *cp)
-{
-	g_access(cp, -1, 0, 0);
-	g_detach(cp);
-}
-
 int
 vdev_geom_read_pool_label(const char *name,
     nvlist_t ***configs, uint64_t *count)
 {
 	struct g_class *mp;
-	struct g_geom *gp, *zgp;
+	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_consumer *zcp;
 	nvlist_t *vdev_cfg;
 	uint64_t pool_guid;
-	int error;
+	int error, nlabels;
 
 	DROP_GIANT();
 	g_topology_lock();
 
-	zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
-	/* This orphan function should be never called. */
-	zgp->orphan = vdev_geom_taste_orphan;
-	zcp = g_new_consumer(zgp);
-
 	*configs = NULL;
 	*count = 0;
 	pool_guid = 0;
@@ -419,13 +594,14 @@
 			LIST_FOREACH(pp, &gp->provider, provider) {
 				if (pp->flags & G_PF_WITHER)
 					continue;
-				if (vdev_geom_attach_taster(zcp, pp) != 0)
+				zcp = vdev_geom_attach(pp, NULL, B_TRUE);
+				if (zcp == NULL)
 					continue;
 				g_topology_unlock();
-				error = vdev_geom_read_config(zcp, &vdev_cfg);
+				nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
 				g_topology_lock();
-				vdev_geom_detach_taster(zcp);
-				if (error)
+				vdev_geom_detach(zcp, B_TRUE);
+				if (nlabels == 0)
 					continue;
 				ZFS_LOG(1, "successfully read vdev config");
 
@@ -434,9 +610,6 @@
 			}
 		}
 	}
-
-	g_destroy_consumer(zcp);
-	g_destroy_geom(zgp);
 	g_topology_unlock();
 	PICKUP_GIANT();
 
@@ -443,39 +616,89 @@
 	return (*count > 0 ? 0 : ENOENT);
 }
 
-static uint64_t
-vdev_geom_read_guid(struct g_consumer *cp)
+enum match {
+	NO_MATCH = 0,		/* No matching labels found */
+	TOPGUID_MATCH = 1,	/* Labels match top guid, not vdev guid */
+	ZERO_MATCH = 1,		/* Should never be returned */
+	ONE_MATCH = 2,		/* 1 label matching the vdev_guid */
+	TWO_MATCH = 3,		/* 2 labels matching the vdev_guid */
+	THREE_MATCH = 4,	/* 3 labels matching the vdev_guid */
+	FULL_MATCH = 5		/* all labels match the vdev_guid */
+};
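
   The numeric values are deliberately ordered so that a plain integer
   comparison ranks attachment candidates: every additional label
   carrying the expected vdev guid outranks a bare top-guid match.  For
   example, with 3 of the 4 labels readable and matching:

	enum match m = ZERO_MATCH + 3;	/* == THREE_MATCH (4) */
	ASSERT(m > TOPGUID_MATCH);	/* beats a top-guid-only candidate */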
+
+static enum match
+vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
 {
 	nvlist_t *config;
-	uint64_t guid;
+	uint64_t pool_guid, top_guid, vdev_guid;
+	struct g_consumer *cp;
+	int nlabels;
 
-	g_topology_assert_not();
+	cp = vdev_geom_attach(pp, NULL, B_TRUE);
+	if (cp == NULL) {
+		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
+		    pp->name);
+		return (NO_MATCH);
+	}
+	g_topology_unlock();
+	nlabels = vdev_geom_read_config(cp, &config);
+	g_topology_lock();
+	vdev_geom_detach(cp, B_TRUE);
+	if (nlabels == 0) {
+		ZFS_LOG(1, "Unable to read config from %s.", pp->name);
+		return (NO_MATCH);
+	}
 
-	guid = 0;
-	if (vdev_geom_read_config(cp, &config) == 0) {
-		guid = nvlist_get_guid(config);
-		nvlist_free(config);
+	pool_guid = 0;
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
+	top_guid = 0;
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
+	vdev_guid = 0;
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
+	nvlist_free(config);
+
+	/*
+	 * Check that the label's pool guid matches the desired guid.
+	 * Inactive spares and L2ARCs do not have any pool guid in the label.
+	 */
+	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
+		ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
+		    pp->name,
+		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
+		return (NO_MATCH);
 	}
-	return (guid);
+
+	/*
+	 * Check that the label's vdev guid matches the desired guid.
+	 * The second condition handles a possible race on vdev detach, when
+	 * the remaining vdev receives the GUID of a destroyed top-level
+	 * mirror vdev.
+	 */
+	if (vdev_guid == vd->vdev_guid) {
+		ZFS_LOG(1, "guids match for provider %s.", pp->name);
+		return (ZERO_MATCH + nlabels);
+	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
+		ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
+		return (TOPGUID_MATCH);
+	}
+	ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
+	    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
+	return (NO_MATCH);
 }
 
 static struct g_consumer *
-vdev_geom_attach_by_guid(uint64_t guid)
+vdev_geom_attach_by_guids(vdev_t *vd)
 {
 	struct g_class *mp;
-	struct g_geom *gp, *zgp;
-	struct g_provider *pp;
-	struct g_consumer *cp, *zcp;
-	uint64_t pguid;
+	struct g_geom *gp;
+	struct g_provider *pp, *best_pp;
+	struct g_consumer *cp;
+	enum match match, best_match;
 
 	g_topology_assert();
 
-	zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
-	/* This orphan function should be never called. */
-	zgp->orphan = vdev_geom_taste_orphan;
-	zcp = g_new_consumer(zgp);
-
 	cp = NULL;
+	best_pp = NULL;
+	best_match = NO_MATCH;
 	LIST_FOREACH(mp, &g_classes, class) {
 		if (mp == &zfs_vdev_class)
 			continue;
@@ -483,36 +706,30 @@
 			if (gp->flags & G_GEOM_WITHER)
 				continue;
 			LIST_FOREACH(pp, &gp->provider, provider) {
-				if (vdev_geom_attach_taster(zcp, pp) != 0)
-					continue;
-				g_topology_unlock();
-				pguid = vdev_geom_read_guid(zcp);
-				g_topology_lock();
-				vdev_geom_detach_taster(zcp);
-				if (pguid != guid)
-					continue;
-				cp = vdev_geom_attach(pp);
-				if (cp == NULL) {
-					printf("ZFS WARNING: Unable to attach to %s.\n",
-					    pp->name);
-					continue;
+				match = vdev_attach_ok(vd, pp);
+				if (match > best_match) {
+					best_match = match;
+					best_pp = pp;
 				}
-				break;
+				if (match == FULL_MATCH)
+					goto out;
 			}
-			if (cp != NULL)
-				break;
 		}
-		if (cp != NULL)
-			break;
 	}
-end:
-	g_destroy_consumer(zcp);
-	g_destroy_geom(zgp);
+
+out:
+	if (best_pp) {
+		cp = vdev_geom_attach(best_pp, vd, B_TRUE);
+		if (cp == NULL) {
+			printf("ZFS WARNING: Unable to attach to %s.\n",
+			    best_pp->name);
+		}
+	}
 	return (cp);
 }
 
 static struct g_consumer *
-vdev_geom_open_by_guid(vdev_t *vd)
+vdev_geom_open_by_guids(vdev_t *vd)
 {
 	struct g_consumer *cp;
 	char *buf;
@@ -520,8 +737,9 @@
 
 	g_topology_assert();
 
-	ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid);
-	cp = vdev_geom_attach_by_guid(vd->vdev_guid);
+	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
+		(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
+	cp = vdev_geom_attach_by_guids(vd);
 	if (cp != NULL) {
 		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
 		buf = kmem_alloc(len, KM_SLEEP);
@@ -530,10 +748,12 @@
 		spa_strfree(vd->vdev_path);
 		vd->vdev_path = buf;
 
-		ZFS_LOG(1, "Attach by guid [%ju] succeeded, provider %s.",
-		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
+		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
+		    (uintmax_t)spa_guid(vd->vdev_spa),
+		    (uintmax_t)vd->vdev_guid, cp->provider->name);
 	} else {
-		ZFS_LOG(1, "Search by guid [%ju] failed.",
+		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
+		    (uintmax_t)spa_guid(vd->vdev_spa),
 		    (uintmax_t)vd->vdev_guid);
 	}
 
@@ -545,7 +765,6 @@
 {
 	struct g_provider *pp;
 	struct g_consumer *cp;
-	uint64_t guid;
 
 	g_topology_assert();
 
@@ -553,23 +772,8 @@
 	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
 	if (pp != NULL) {
 		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
-		cp = vdev_geom_attach(pp);
-		if (cp != NULL && check_guid && ISP2(pp->sectorsize) &&
-		    pp->sectorsize <= VDEV_PAD_SIZE) {
-			g_topology_unlock();
-			guid = vdev_geom_read_guid(cp);
-			g_topology_lock();
-			if (guid != vd->vdev_guid) {
-				vdev_geom_detach(cp, 0);
-				cp = NULL;
-				ZFS_LOG(1, "guid mismatch for provider %s: "
-				    "%ju != %ju.", vd->vdev_path,
-				    (uintmax_t)vd->vdev_guid, (uintmax_t)guid);
-			} else {
-				ZFS_LOG(1, "guid match for provider %s.",
-				    vd->vdev_path);
-			}
-		}
+		if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
+			cp = vdev_geom_attach(pp, vd, B_FALSE);
 	}
 
 	return (cp);
@@ -577,7 +781,7 @@
 
 static int
 vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
-    uint64_t *ashift)
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp;
@@ -584,6 +788,9 @@
 	size_t bufsize;
 	int error;
 
+	/* Set the TLS to indicate downstack that we should not access zvols */
+	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
+
 	/*
 	 * We must have a pathname, and it must be absolute.
 	 */
@@ -592,60 +799,111 @@
 		return (EINVAL);
 	}
 
-	vd->vdev_tsd = NULL;
+	/*
+	 * Reopen the device if it's not currently open. Otherwise,
+	 * just update the physical size of the device.
+	 */
+	if ((cp = vd->vdev_tsd) != NULL) {
+		ASSERT(vd->vdev_reopening);
+		goto skip_open;
+	}
 
 	DROP_GIANT();
 	g_topology_lock();
 	error = 0;
 
-	/*
-	 * If we're creating or splitting a pool, just find the GEOM provider
-	 * by its name and ignore GUID mismatches.
-	 */
-	if (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
-	    vd->vdev_spa->spa_splitting_newspa == B_TRUE)
+	if (vd->vdev_spa->spa_splitting_newspa ||
+	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
+	     vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
+	     vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
+		/*
+		 * We are dealing with a vdev that hasn't been previously
+		 * opened (since boot), and we are not loading an
+		 * existing pool configuration.  This looks like a
+		 * vdev add operation to a new or existing pool.
+		 * Assume the user knows what he/she is doing and find
+		 * GEOM provider by its name, ignoring GUID mismatches.
+		 *
+		 * XXPOLICY: It would be safer to only allow a device
+		 *           that is unlabeled or labeled but missing
+		 *           GUID information to be opened in this fashion,
+		 *           unless we are doing a split, in which case we
+		 *           should allow any guid.
+		 */
 		cp = vdev_geom_open_by_path(vd, 0);
-	else {
+	} else {
+		/*
+		 * Try using the recorded path for this device, but only
+		 * accept it if its label data contains the expected GUIDs.
+		 */
 		cp = vdev_geom_open_by_path(vd, 1);
 		if (cp == NULL) {
 			/*
 			 * The device at vd->vdev_path doesn't have the
-			 * expected guid. The disks might have merely
+			 * expected GUIDs. The disks might have merely
 			 * moved around so try all other GEOM providers
-			 * to find one with the right guid.
+			 * to find one with the right GUIDs.
 			 */
-			cp = vdev_geom_open_by_guid(vd);
+			cp = vdev_geom_open_by_guids(vd);
 		}
 	}
 
+	/* Clear the TLS now that tasting is done */
+	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
+
 	if (cp == NULL) {
-		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
+		ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
 		error = ENOENT;
-	} else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
-	    !ISP2(cp->provider->sectorsize)) {
-		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
-		    vd->vdev_path);
-		vdev_geom_detach(cp, 0);
-		error = EINVAL;
-		cp = NULL;
-	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
-		int i;
+	} else {
+		struct consumer_priv_t *priv;
+		struct consumer_vdev_elem *elem;
+		int spamode;
 
-		for (i = 0; i < 5; i++) {
-			error = g_access(cp, 0, 1, 0);
-			if (error == 0)
-				break;
-			g_topology_unlock();
-			tsleep(vd, 0, "vdev", hz / 2);
-			g_topology_lock();
-		}
-		if (error != 0) {
-			printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
-			    vd->vdev_path, error);
-			vdev_geom_detach(cp, 0);
+		priv = (struct consumer_priv_t*)&cp->private;
+		if (cp->private == NULL)
+			SLIST_INIT(priv);
+		elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO);
+		elem->vd = vd;
+		SLIST_INSERT_HEAD(priv, elem, elems);
+
+		spamode = spa_mode(vd->vdev_spa);
+		if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
+		    !ISP2(cp->provider->sectorsize)) {
+			ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
+			    cp->provider->name);
+
+			vdev_geom_close_locked(vd);
+			error = EINVAL;
 			cp = NULL;
+		} else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
+			int i;
+
+			for (i = 0; i < 5; i++) {
+				error = g_access(cp, 0, 1, 0);
+				if (error == 0)
+					break;
+				g_topology_unlock();
+				tsleep(vd, 0, "vdev", hz / 2);
+				g_topology_lock();
+			}
+			if (error != 0) {
+				printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
+				    cp->provider->name, error);
+				vdev_geom_close_locked(vd);
+				cp = NULL;
+			}
 		}
 	}
+
+	/* Fetch initial physical path information for this device. */
+	if (cp != NULL) {
+		vdev_geom_attrchanged(cp, "GEOM::physpath");
+
+		/* Set other GEOM characteristics */
+		vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE);
+		vdev_geom_set_rotation_rate(vd, cp);
+	}
+
 	g_topology_unlock();
 	PICKUP_GIANT();
 	if (cp == NULL) {
@@ -652,9 +910,7 @@
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		return (error);
 	}
-
-	cp->private = vd;
-	vd->vdev_tsd = cp;
+skip_open:
 	pp = cp->provider;
 
 	/*
@@ -663,9 +919,14 @@
 	*max_psize = *psize = pp->mediasize;
 
 	/*
-	 * Determine the device's minimum transfer size.
+	 * Determine the device's minimum transfer size and preferred
+	 * transfer size.
 	 */
-	*ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
+	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
+	*physical_ashift = 0;
+	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
+	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
+		*physical_ashift = highbit(pp->stripesize) - 1;
 
 	/*
 	 * Clear the nowritecache settings, so that on a vdev_reopen()
@@ -673,12 +934,6 @@
 	 */
 	vd->vdev_nowritecache = B_FALSE;
 
-	if (vd->vdev_physpath != NULL)
-		spa_strfree(vd->vdev_physpath);
-	bufsize = sizeof("/dev/") + strlen(pp->name);
-	vd->vdev_physpath = kmem_alloc(bufsize, KM_SLEEP);
-	snprintf(vd->vdev_physpath, bufsize, "/dev/%s", pp->name);
-
 	return (0);
 }
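
   A worked example of the ashift derivation above: a 512-byte-emulated
   "Advanced Format" disk (sectorsize 512, stripesize 4096,
   stripeoffset 0) yields

	logical_ashift  = highbit(MAX(512, SPA_MINBLOCKSIZE)) - 1 = 9
	physical_ashift = highbit(4096) - 1 = 12

   while a native 4Kn disk (sectorsize 4096, no stripe hint) gets
   logical_ashift = 12 and physical_ashift = 0.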
 
@@ -688,12 +943,17 @@
 	struct g_consumer *cp;
 
 	cp = vd->vdev_tsd;
-	if (cp == NULL)
-		return;
-	vd->vdev_tsd = NULL;
-	vd->vdev_delayed_close = B_FALSE;
-	cp->private = NULL;	/* XXX locking */
-	g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL);
+
+	DROP_GIANT();
+	g_topology_lock();
+
+	if (!vd->vdev_reopening ||
+	    (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
+	    (cp->provider != NULL && cp->provider->error != 0))))
+		vdev_geom_close_locked(vd);
+
+	g_topology_unlock();
+	PICKUP_GIANT();
 }
 
 static void
@@ -706,50 +966,46 @@
 	vd = zio->io_vd;
 	zio->io_error = bp->bio_error;
 	if (zio->io_error == 0 && bp->bio_resid != 0)
-		zio->io_error = EIO;
-	if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == ENOTSUP) {
+		zio->io_error = SET_ERROR(EIO);
+
+	switch(zio->io_error) {
+	case ENOTSUP:
 		/*
-		 * If we get ENOTSUP, we know that no future
-		 * attempts will ever succeed.  In this case we
-		 * set a persistent bit so that we don't bother
-		 * with the ioctl in the future.
+		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
+		 * that future attempts will never succeed. In this case
+		 * we set a persistent flag so that we don't bother with
+		 * requests in the future.
 		 */
-		vd->vdev_nowritecache = B_TRUE;
-	}
-	if (bp->bio_cmd == BIO_DELETE && bp->bio_error == ENOTSUP) {
-		/*
-		 * If we get ENOTSUP, we know that no future
-		 * attempts will ever succeed.  In this case we
-		 * set a persistent bit so that we don't bother
-		 * with the ioctl in the future.
-		 */
-		vd->vdev_notrim = B_TRUE;
-	}
-	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
-		/*
-		 * If provider's error is set we assume it is being
-		 * removed.
-		 */
-		if (bp->bio_to->error != 0) {
+		switch(bp->bio_cmd) {
+		case BIO_FLUSH:
+			vd->vdev_nowritecache = B_TRUE;
+			break;
+		case BIO_DELETE:
+			vd->vdev_notrim = B_TRUE;
+			break;
+		}
+		break;
+	case ENXIO:
+		if (!vd->vdev_remove_wanted) {
 			/*
-			 * We post the resource as soon as possible, instead of
-			 * when the async removal actually happens, because the
-			 * DE is using this information to discard previous I/O
-			 * errors.
+			 * If provider's error is set we assume it is being
+			 * removed.
 			 */
-			/* XXX: zfs_post_remove() can sleep. */
-			zfs_post_remove(zio->io_spa, vd);
-			vd->vdev_remove_wanted = B_TRUE;
-			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
-		} else if (!vd->vdev_delayed_close) {
-			vd->vdev_delayed_close = B_TRUE;
+			if (bp->bio_to->error != 0) {
+				vd->vdev_remove_wanted = B_TRUE;
+				spa_async_request(zio->io_spa,
+				    SPA_ASYNC_REMOVE);
+			} else if (!vd->vdev_delayed_close) {
+				vd->vdev_delayed_close = B_TRUE;
+			}
 		}
+		break;
 	}
 	g_destroy_bio(bp);
-	zio_interrupt(zio);
+	zio_delay_interrupt(zio);
 }
 
-static int
+static void
 vdev_geom_io_start(zio_t *zio)
 {
 	vdev_t *vd;
@@ -759,41 +1015,50 @@
 
 	vd = zio->io_vd;
 
-	if (zio->io_type == ZIO_TYPE_IOCTL) {
+	switch (zio->io_type) {
+	case ZIO_TYPE_IOCTL:
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
-			zio->io_error = ENXIO;
-			return (ZIO_PIPELINE_CONTINUE);
+			zio->io_error = SET_ERROR(ENXIO);
+			zio_interrupt(zio);
+			return;
+		} else {
+			switch (zio->io_cmd) {
+			case DKIOCFLUSHWRITECACHE:
+				if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
+					break;
+				if (vd->vdev_nowritecache) {
+					zio->io_error = SET_ERROR(ENOTSUP);
+					break;
+				}
+				goto sendreq;
+			default:
+				zio->io_error = SET_ERROR(ENOTSUP);
+			}
 		}
 
-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-			if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
-				break;
-			if (vd->vdev_nowritecache) {
-				zio->io_error = ENOTSUP;
-				break;
-			}
+		zio_execute(zio);
+		return;
+	case ZIO_TYPE_FREE:
+		if (vd->vdev_notrim) {
+			zio->io_error = SET_ERROR(ENOTSUP);
+		} else if (!vdev_geom_bio_delete_disable) {
 			goto sendreq;
-		case DKIOCTRIM:
-			if (vdev_geom_bio_delete_disable)
-				break;
-			if (vd->vdev_notrim) {
-				zio->io_error = ENOTSUP;
-				break;
-			}
-			goto sendreq;
-		default:
-			zio->io_error = ENOTSUP;
 		}
-
-		return (ZIO_PIPELINE_CONTINUE);
+		zio_execute(zio);
+		return;
 	}
 sendreq:
+	ASSERT(zio->io_type == ZIO_TYPE_READ ||
+	    zio->io_type == ZIO_TYPE_WRITE ||
+	    zio->io_type == ZIO_TYPE_FREE ||
+	    zio->io_type == ZIO_TYPE_IOCTL);
+
 	cp = vd->vdev_tsd;
 	if (cp == NULL) {
-		zio->io_error = ENXIO;
-		return (ZIO_PIPELINE_CONTINUE);
+		zio->io_error = SET_ERROR(ENXIO);
+		zio_interrupt(zio);
+		return;
 	}
 	bp = g_alloc_bio();
 	bp->bio_caller1 = zio;
@@ -800,34 +1065,29 @@
 	switch (zio->io_type) {
 	case ZIO_TYPE_READ:
 	case ZIO_TYPE_WRITE:
+		zio->io_target_timestamp = zio_handle_io_delay(zio);
 		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
 		bp->bio_data = zio->io_data;
 		bp->bio_offset = zio->io_offset;
 		bp->bio_length = zio->io_size;
 		break;
+	case ZIO_TYPE_FREE:
+		bp->bio_cmd = BIO_DELETE;
+		bp->bio_data = NULL;
+		bp->bio_offset = zio->io_offset;
+		bp->bio_length = zio->io_size;
+		break;
 	case ZIO_TYPE_IOCTL:
-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-			bp->bio_cmd = BIO_FLUSH;
-			bp->bio_flags |= BIO_ORDERED;
-			bp->bio_data = NULL;
-			bp->bio_offset = cp->provider->mediasize;
-			bp->bio_length = 0;
-			break;
-		case DKIOCTRIM:
-			bp->bio_cmd = BIO_DELETE;
-			bp->bio_data = NULL;
-			bp->bio_offset = zio->io_offset;
-			bp->bio_length = zio->io_size;
-			break;
-		}
+		bp->bio_cmd = BIO_FLUSH;
+		bp->bio_flags |= BIO_ORDERED;
+		bp->bio_data = NULL;
+		bp->bio_offset = cp->provider->mediasize;
+		bp->bio_length = 0;
 		break;
 	}
 	bp->bio_done = vdev_geom_io_intr;
 
 	g_io_request(bp, cp);
-
-	return (ZIO_PIPELINE_STOP);
 }
 
 static void

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,7 +22,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 /*
@@ -186,7 +186,7 @@
 
 static void
 vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
-	uint64_t size, zio_done_func_t *done, void *private, int flags)
+    uint64_t size, zio_done_func_t *done, void *private, int flags)
 {
 	ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) ==
 	    SCL_STATE_ALL);
@@ -200,7 +200,7 @@
 
 static void
 vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
-	uint64_t size, zio_done_func_t *done, void *private, int flags)
+    uint64_t size, zio_done_func_t *done, void *private, int flags)
 {
 	ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL ||
 	    (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
@@ -223,30 +223,25 @@
 {
 	nvlist_t *nv = NULL;
 
-	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	nv = fnvlist_alloc();
 
-	VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
-	    vd->vdev_ops->vdev_op_type) == 0);
+	fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
 	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
-		    == 0);
-	VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
 
 	if (vd->vdev_path != NULL)
-		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
-		    vd->vdev_path) == 0);
+		fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
 
 	if (vd->vdev_devid != NULL)
-		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID,
-		    vd->vdev_devid) == 0);
+		fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
 
 	if (vd->vdev_physpath != NULL)
-		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
-		    vd->vdev_physpath) == 0);
+		fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+		    vd->vdev_physpath);
 
 	if (vd->vdev_fru != NULL)
-		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU,
-		    vd->vdev_fru) == 0);
+		fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
 
 	if (vd->vdev_nparity != 0) {
 		ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
@@ -267,44 +262,41 @@
 		 * that only support a single parity device -- older software
 		 * will just ignore it.
 		 */
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY,
-		    vd->vdev_nparity) == 0);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
 	}
 
 	if (vd->vdev_wholedisk != -1ULL)
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
-		    vd->vdev_wholedisk) == 0);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+		    vd->vdev_wholedisk);
 
 	if (vd->vdev_not_present)
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
 
 	if (vd->vdev_isspare)
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
 
 	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
 	    vd == vd->vdev_top) {
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
-		    vd->vdev_ms_array) == 0);
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
-		    vd->vdev_ms_shift) == 0);
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT,
-		    vd->vdev_ashift) == 0);
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
-		    vd->vdev_asize) == 0);
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG,
-		    vd->vdev_islog) == 0);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+		    vd->vdev_ms_array);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+		    vd->vdev_ms_shift);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
+		    vd->vdev_asize);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
 		if (vd->vdev_removing)
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
-			    vd->vdev_removing) == 0);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
+			    vd->vdev_removing);
 	}
 
-	if (vd->vdev_dtl_smo.smo_object != 0)
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
-		    vd->vdev_dtl_smo.smo_object) == 0);
+	if (vd->vdev_dtl_sm != NULL) {
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
+		    space_map_object(vd->vdev_dtl_sm));
+	}
 
 	if (vd->vdev_crtxg)
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
-		    vd->vdev_crtxg) == 0);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
 
 	if (getstats) {
 		vdev_stat_t vs;
@@ -311,15 +303,14 @@
 		pool_scan_stat_t ps;
 
 		vdev_get_stats(vd, &vs);
-		VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
-		    (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+		    (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));
 
 		/* provide either current or previous scan information */
 		if (spa_scan_get_stats(spa, &ps) == 0) {
-			VERIFY(nvlist_add_uint64_array(nv,
+			fnvlist_add_uint64_array(nv,
 			    ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
-			    sizeof (pool_scan_stat_t) / sizeof (uint64_t))
-			    == 0);
+			    sizeof (pool_scan_stat_t) / sizeof (uint64_t));
 		}
 	}
 
@@ -349,8 +340,8 @@
 		}
 
 		if (idx) {
-			VERIFY(nvlist_add_nvlist_array(nv,
-			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
+			fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+			    child, idx);
 		}
 
 		for (c = 0; c < idx; c++)
@@ -362,26 +353,20 @@
 		const char *aux = NULL;
 
 		if (vd->vdev_offline && !vd->vdev_tmpoffline)
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
-			    B_TRUE) == 0);
-		if (vd->vdev_resilvering)
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVERING,
-			    B_TRUE) == 0);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE);
+		if (vd->vdev_resilver_txg != 0)
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
+			    vd->vdev_resilver_txg);
 		if (vd->vdev_faulted)
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED,
-			    B_TRUE) == 0);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
 		if (vd->vdev_degraded)
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED,
-			    B_TRUE) == 0);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE);
 		if (vd->vdev_removed)
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED,
-			    B_TRUE) == 0);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE);
 		if (vd->vdev_unspare)
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE,
-			    B_TRUE) == 0);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE);
 		if (vd->vdev_ishole)
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE,
-			    B_TRUE) == 0);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);
 
 		switch (vd->vdev_stat.vs_aux) {
 		case VDEV_AUX_ERR_EXCEEDED:
@@ -394,12 +379,11 @@
 		}
 
 		if (aux != NULL)
-			VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE,
-			    aux) == 0);
+			fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);
 
 		if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
-			    vd->vdev_orig_guid) == 0);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
+			    vd->vdev_orig_guid);
 		}
 	}
 
@@ -619,7 +603,8 @@
 	 * read-only.  Instead we look to see if the pools is marked
 	 * read-only in the namespace and set the state to active.
 	 */
-	if ((spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
+	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+	    (spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
 	    spa_mode(spa) == FREAD)
 		state = POOL_STATE_ACTIVE;
 
@@ -663,7 +648,7 @@
 	/* Track the creation time for this vdev */
 	vd->vdev_crtxg = crtxg;
 
-	if (!vd->vdev_ops->vdev_op_leaf)
+	if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
 		return (0);
 
 	/*
@@ -730,8 +715,9 @@
 	 * Don't TRIM if removing so that we don't interfere with zpool
 	 * disaster recovery.
 	 */
-	if (zfs_trim_enabled && vdev_trim_on_init && (reason == VDEV_LABEL_CREATE ||
-	    reason == VDEV_LABEL_SPARE || reason == VDEV_LABEL_L2CACHE))
+	if (zfs_trim_enabled && vdev_trim_on_init && !vd->vdev_notrim && 
+	    (reason == VDEV_LABEL_CREATE || reason == VDEV_LABEL_SPARE ||
+	    reason == VDEV_LABEL_L2CACHE))
 		zio_wait(zio_trim(NULL, spa, vd, 0, vd->vdev_psize));
 
 	/*
@@ -872,6 +858,44 @@
 	return (error);
 }
 
+int
+vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size)
+{
+	spa_t *spa = vd->vdev_spa;
+	zio_t *zio;
+	char *pad2;
+	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+	int error;
+
+	if (size > VDEV_PAD_SIZE)
+		return (EINVAL);
+
+	if (!vd->vdev_ops->vdev_op_leaf)
+		return (ENODEV);
+	if (vdev_is_dead(vd))
+		return (ENXIO);
+
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+	pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
+	bzero(pad2, VDEV_PAD_SIZE);
+	memcpy(pad2, buf, size);
+
+retry:
+	zio = zio_root(spa, NULL, NULL, flags);
+	vdev_label_write(zio, vd, 0, pad2,
+	    offsetof(vdev_label_t, vl_pad2),
+	    VDEV_PAD_SIZE, NULL, NULL, flags);
+	error = zio_wait(zio);
+	if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+		flags |= ZIO_FLAG_TRYHARD;
+		goto retry;
+	}
+
+	zio_buf_free(pad2, VDEV_PAD_SIZE);
+	return (error);
+}
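
   A sketch of how a caller might use this, assuming vd is a healthy
   leaf vdev and SCL_ALL is held as writer per the ASSERT above (the
   payload string is purely illustrative):

	const char *msg = "nextboot configuration";
	int error;

	error = vdev_label_write_pad2(vd, msg, strlen(msg) + 1);
	if (error != 0)
		return (error);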
+
 /*
  * ==========================================================================
  * uberblock load/sync
@@ -1009,7 +1033,7 @@
 	uint64_t *good_writes = zio->io_private;
 
 	if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
-		atomic_add_64(good_writes, 1);
+		atomic_inc_64(good_writes);
 }
 
 /*
@@ -1084,7 +1108,7 @@
 	uint64_t *good_writes = zio->io_private;
 
 	if (zio->io_error == 0)
-		atomic_add_64(good_writes, 1);
+		atomic_inc_64(good_writes);
 }
 
 /*
@@ -1209,15 +1233,16 @@
  * at any time, you can just call it again, and it will resume its work.
  */
 int
-vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard)
+vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
 {
 	spa_t *spa = svd[0]->vdev_spa;
 	uberblock_t *ub = &spa->spa_uberblock;
 	vdev_t *vd;
 	zio_t *zio;
-	int error;
+	int error = 0;
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
 
+retry:
 	/*
 	 * Normally, we don't want to try too hard to write every label and
 	 * uberblock.  If there is a flaky disk, we don't want the rest of the
@@ -1225,8 +1250,11 @@
 	 * single label out, we should retry with ZIO_FLAG_TRYHARD before
 	 * bailing out and declaring the pool faulted.
 	 */
-	if (tryhard)
+	if (error != 0) {
+		if ((flags & ZIO_FLAG_TRYHARD) != 0)
+			return (error);
 		flags |= ZIO_FLAG_TRYHARD;
+	}
 
 	ASSERT(ub->ub_txg <= txg);
 
@@ -1270,7 +1298,7 @@
 	 * are committed to stable storage before the uberblock update.
 	 */
 	if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0)
-		return (error);
+		goto retry;
 
 	/*
 	 * Sync the uberblocks to all vdevs in svd[].
@@ -1288,7 +1316,7 @@
 	 *	to the new uberblocks.
 	 */
 	if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0)
-		return (error);
+		goto retry;
 
 	/*
 	 * Sync out odd labels for every dirty vdev.  If the system dies
@@ -1301,7 +1329,7 @@
 	 * stable storage before the next transaction group begins.
 	 */
 	if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0)
-		return (error);
+		goto retry;
 
 	trim_thread_wakeup(spa);
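
   The retry conversion above folds the old tryhard argument into the
   function itself: the first pass runs with ZIO_FLAG_CANFAIL only, and
   any failing step jumps back to retry, which escalates to
   ZIO_FLAG_TRYHARD exactly once before giving up.  Reduced to its
   control flow (with do_sync_pass() as a hypothetical stand-in for the
   three label/uberblock sync steps):

	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
	int error = 0;
retry:
	if (error != 0) {
		if ((flags & ZIO_FLAG_TRYHARD) != 0)
			return (error);		/* second pass also failed */
		flags |= ZIO_FLAG_TRYHARD;
	}
	if ((error = do_sync_pass(flags)) != 0)
		goto retry;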
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -25,11 +25,14 @@
  */
 
 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/fs/zfs.h>
@@ -42,6 +45,7 @@
 	vdev_t		*mc_vd;
 	uint64_t	mc_offset;
 	int		mc_error;
+	int		mc_load;
 	uint8_t		mc_tried;
 	uint8_t		mc_skipped;
 	uint8_t		mc_speculative;
@@ -48,21 +52,102 @@
 } mirror_child_t;
 
 typedef struct mirror_map {
+	int		*mm_preferred;
+	int		mm_preferred_cnt;
 	int		mm_children;
-	int		mm_replacing;
-	int		mm_preferred;
-	int		mm_root;
-	mirror_child_t	mm_child[1];
+	boolean_t	mm_resilvering;
+	boolean_t	mm_root;
+	mirror_child_t	mm_child[];
 } mirror_map_t;
 
-int vdev_mirror_shift = 21;
+static int vdev_mirror_shift = 21;
 
+#ifdef _KERNEL
+SYSCTL_DECL(_vfs_zfs_vdev);
+static SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
+    "ZFS VDEV Mirror");
+#endif
+
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * non_rotating_seek_inc to 0 may well provide better results, as it
+ * will direct more reads to the non-rotating vdevs, which are likely
+ * to deliver higher performance.
+ */
+
+/* Rotating media load calculation configuration. */
+static int rotating_inc = 0;
+#ifdef _KERNEL
+TUNABLE_INT("vfs.zfs.vdev.mirror.rotating_inc", &rotating_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_inc, CTLFLAG_RW,
+    &rotating_inc, 0, "Rotating media load increment for non-seeking I/O's");
+#endif
+
+static int rotating_seek_inc = 5;
+#ifdef _KERNEL
+TUNABLE_INT("vfs.zfs.vdev.mirror.rotating_seek_inc", &rotating_seek_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_inc, CTLFLAG_RW,
+    &rotating_seek_inc, 0, "Rotating media load increment for seeking I/O's");
+#endif
+
+static int rotating_seek_offset = 1 * 1024 * 1024;
+#ifdef _KERNEL
+TUNABLE_INT("vfs.zfs.vdev.mirror.rotating_seek_offset", &rotating_seek_offset);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_offset, CTLFLAG_RW,
+    &rotating_seek_offset, 0, "Offset in bytes from the last I/O which "
+    "triggers a reduced rotating media seek increment");
+#endif
+
+/* Non-rotating media load calculation configuration. */
+static int non_rotating_inc = 0;
+#ifdef _KERNEL
+TUNABLE_INT("vfs.zfs.vdev.mirror.non_rotating_inc", &non_rotating_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_inc, CTLFLAG_RW,
+    &non_rotating_inc, 0,
+    "Non-rotating media load increment for non-seeking I/O's");
+#endif
+
+static int non_rotating_seek_inc = 1;
+#ifdef _KERNEL
+TUNABLE_INT("vfs.zfs.vdev.mirror.non_rotating_seek_inc",
+     &non_rotating_seek_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_seek_inc, CTLFLAG_RW,
+    &non_rotating_seek_inc, 0,
+    "Non-rotating media load increment for seeking I/O's");
+#endif
+
+
+static inline size_t
+vdev_mirror_map_size(int children)
+{
+	return (offsetof(mirror_map_t, mm_child[children]) +
+	    sizeof(int) * children);
+}
+
+static inline mirror_map_t *
+vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
+{
+	mirror_map_t *mm;
+
+	mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
+	mm->mm_children = children;
+	mm->mm_resilvering = resilvering;
+	mm->mm_root = root;
+	mm->mm_preferred = (int *)((uintptr_t)mm +
+	    offsetof(mirror_map_t, mm_child[children]));
+
+	return (mm);
+}
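
   The allocator above packs everything into one kmem block, with the
   flexible mm_child[] array followed immediately by the mm_preferred[]
   index array:

	| mirror_map_t header | mm_child[0 .. children-1] | int preferred[children] |
	^ mm                                              ^ mm->mm_preferred

   so vdev_mirror_map_size() is offsetof(mirror_map_t,
   mm_child[children]) plus children ints, and freeing the map is a
   single kmem_free() of that same size.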
+
 static void
 vdev_mirror_map_free(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 
-	kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
+	kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
 }
 
 static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
@@ -70,55 +155,110 @@
 	zio_vsd_default_cksum_report
 };
 
+static int
+vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
+{
+	uint64_t lastoffset;
+	int load;
+
+	/* All DVAs have equal weight at the root. */
+	if (mm->mm_root)
+		return (INT_MAX);
+
+	/*
+	 * We don't return INT_MAX if the device is resilvering (i.e.
+	 * vdev_resilver_txg != 0), because in testing overall performance
+	 * was slightly worse with that special case than without it.
+	 */
+
+	/* Standard load based on pending queue length. */
+	load = vdev_queue_length(vd);
+	lastoffset = vdev_queue_lastoffset(vd);
+
+	if (vd->vdev_rotation_rate == VDEV_RATE_NON_ROTATING) {
+		/* Non-rotating media. */
+		if (lastoffset == zio_offset)
+			return (load + non_rotating_inc);
+
+		/*
+		 * Apply a seek penalty even for non-rotating devices as
+		 * sequential I/O'a can be aggregated into fewer operations
+		 * sequential I/O's can be aggregated into fewer operations
+		 * overhead and boosting performance.
+		 */
+		return (load + non_rotating_seek_inc);
+	}
+
+	/* Rotating media I/O's which directly follow the last I/O. */
+	if (lastoffset == zio_offset)
+		return (load + rotating_inc);
+
+	/*
+	 * Apply half the seek increment to I/O's within seek offset
+	 * of the last I/O queued to this vdev, as they should incur less
+	 * of a seek increment.
+	 */
+	if (ABS(lastoffset - zio_offset) < rotating_seek_offset)
+		return (load + (rotating_seek_inc / 2));
+
+	/* Apply the full seek increment to all other I/O's. */
+	return (load + rotating_seek_inc);
+}
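
   With the default tunables, a rotating child with three I/O's already
   queued (vdev_queue_length() == 3) scores:

	sequential (offset == lastoffset):        3 + rotating_inc        = 3
	seek within rotating_seek_offset (1 MB):  3 + rotating_seek_inc/2 = 5
	any longer seek:                          3 + rotating_seek_inc   = 8

   A non-rotating child with the same queue depth scores 3 for
   sequential I/O and 4 otherwise, so the SSD side of a mixed mirror
   tends to win unless its queue is noticeably deeper.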
+
+
 static mirror_map_t *
-vdev_mirror_map_alloc(zio_t *zio)
+vdev_mirror_map_init(zio_t *zio)
 {
 	mirror_map_t *mm = NULL;
 	mirror_child_t *mc;
 	vdev_t *vd = zio->io_vd;
-	int c, d;
+	int c;
 
 	if (vd == NULL) {
 		dva_t *dva = zio->io_bp->blk_dva;
 		spa_t *spa = zio->io_spa;
 
-		c = BP_GET_NDVAS(zio->io_bp);
-
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
-		mm->mm_children = c;
-		mm->mm_replacing = B_FALSE;
-		mm->mm_preferred = spa_get_random(c);
-		mm->mm_root = B_TRUE;
-
-		/*
-		 * Check the other, lower-index DVAs to see if they're on
-		 * the same vdev as the child we picked.  If they are, use
-		 * them since they are likely to have been allocated from
-		 * the primary metaslab in use at the time, and hence are
-		 * more likely to have locality with single-copy data.
-		 */
-		for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
-			if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
-				mm->mm_preferred = d;
-		}
-
+		mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
+		    B_TRUE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
-
 			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
 			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
 		}
 	} else {
-		c = vd->vdev_children;
-
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
-		mm->mm_children = c;
-		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
-		    vd->vdev_ops == &vdev_spare_ops);
-		mm->mm_preferred = mm->mm_replacing ? 0 :
-		    (zio->io_offset >> vdev_mirror_shift) % c;
-		mm->mm_root = B_FALSE;
-
+		/*
+		 * If we are resilvering, then we should handle scrub reads
+		 * differently; we shouldn't issue them to the resilvering
+		 * device because it might not have those blocks.
+		 *
+		 * We are resilvering iff:
+		 * 1) We are a replacing vdev (i.e. our name is "replacing-1" or
+		 *    "spare-1" or something like that), and
+		 * 2) The pool is currently being resilvered.
+		 *
+		 * We cannot simply check vd->vdev_resilver_txg, because it's
+		 * not set in this path.
+		 *
+		 * Nor can we just check our vdev_ops; there are cases (such as
+		 * when a user types "zpool replace pool odev spare_dev" and
+		 * spare_dev is in the spare list, or when a spare device is
+		 * automatically used to replace a DEGRADED device) when
+		 * resilvering is complete but both the original vdev and the
+		 * spare vdev remain in the pool.  That behavior is intentional.
+		 * It helps implement the policy that a spare should be
+		 * automatically removed from the pool after the user replaces
+		 * the device that originally failed.
+		 *
+		 * If a spa load is in progress, then spa_dsl_pool may be
+		 * uninitialized.  But we shouldn't be resilvering during a spa
+		 * load anyway.
+		 */
+		boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
+		    vd->vdev_ops == &vdev_spare_ops) &&
+		    spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
+		    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
+		mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
+		    B_FALSE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 			mc->mc_vd = vd->vdev_child[c];
@@ -133,7 +273,7 @@
 
 static int
 vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
-    uint64_t *ashift)
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	int numerrors = 0;
 	int lasterror = 0;
@@ -156,7 +296,9 @@
 
 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
-		*ashift = MAX(*ashift, cvd->vdev_ashift);
+		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
+		*physical_ashift = MAX(*physical_ashift,
+		    cvd->vdev_physical_ashift);
 	}
 
 	if (numerrors == vd->vdev_children) {
@@ -191,9 +333,10 @@
 
 	if (zio->io_error == 0) {
 		zio_t *pio;
+		zio_link_t *zl = NULL;
 
 		mutex_enter(&zio->io_lock);
-		while ((pio = zio_walk_parents(zio)) != NULL) {
+		while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
 			mutex_enter(&pio->io_lock);
 			ASSERT3U(zio->io_size, >=, pio->io_size);
 			bcopy(zio->io_data, pio->io_data, pio->io_size);
@@ -210,7 +353,54 @@
 }
 
 /*
- * Try to find a child whose DTL doesn't contain the block we want to read.
+ * Check the other, lower-index DVAs to see if they're on the same
+ * vdev as the child we picked.  If they are, use them since they
+ * are likely to have been allocated from the primary metaslab in
+ * use at the time, and hence are more likely to have locality with
+ * single-copy data.
+ */
+static int
+vdev_mirror_dva_select(zio_t *zio, int p)
+{
+	dva_t *dva = zio->io_bp->blk_dva;
+	mirror_map_t *mm = zio->io_vsd;
+	int preferred;
+	int c;
+
+	preferred = mm->mm_preferred[p];
+	for (p-- ; p >= 0; p--) {
+		c = mm->mm_preferred[p];
+		if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
+			preferred = c;
+	}
+	return (preferred);
+}
+
+static int
+vdev_mirror_preferred_child_randomize(zio_t *zio)
+{
+	mirror_map_t *mm = zio->io_vsd;
+	int p;
+
+	if (mm->mm_root) {
+		p = spa_get_random(mm->mm_preferred_cnt);
+		return (vdev_mirror_dva_select(zio, p));
+	}
+
+	/*
+	 * To ensure we don't always favour the first matching vdev,
+	 * which could lead to wear leveling issues on SSD's, we
+	 * use the I/O offset as a pseudo random seed into the vdevs
+	 * which have the lowest load.
+	 */
+	p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
+	return (mm->mm_preferred[p]);
+}
+
+/*
+ * Try to find a vdev whose DTL doesn't contain the block we want to read,
+ * preferring vdevs based on their determined load.
+ *
  * If we can't, try the read on any vdev we haven't already tried.
  */
 static int
@@ -217,23 +407,20 @@
 vdev_mirror_child_select(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
-	mirror_child_t *mc;
 	uint64_t txg = zio->io_txg;
-	int i, c;
+	int c, lowest_load;
 
 	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
 
-	/*
-	 * Try to find a child whose DTL doesn't contain the block to read.
-	 * If a child is known to be completely inaccessible (indicated by
-	 * vdev_readable() returning B_FALSE), don't even try.
-	 */
-	for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
-		if (c >= mm->mm_children)
-			c = 0;
+	lowest_load = INT_MAX;
+	mm->mm_preferred_cnt = 0;
+	for (c = 0; c < mm->mm_children; c++) {
+		mirror_child_t *mc;
+
 		mc = &mm->mm_child[c];
 		if (mc->mc_tried || mc->mc_skipped)
 			continue;
+
 		if (!vdev_readable(mc->mc_vd)) {
 			mc->mc_error = SET_ERROR(ENXIO);
 			mc->mc_tried = 1;	/* don't even try */
@@ -240,20 +427,50 @@
 			mc->mc_skipped = 1;
 			continue;
 		}
-		if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
-			return (c);
-		mc->mc_error = SET_ERROR(ESTALE);
-		mc->mc_skipped = 1;
-		mc->mc_speculative = 1;
+
+		if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
+			mc->mc_error = SET_ERROR(ESTALE);
+			mc->mc_skipped = 1;
+			mc->mc_speculative = 1;
+			continue;
+		}
+
+		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
+		if (mc->mc_load > lowest_load)
+			continue;
+
+		if (mc->mc_load < lowest_load) {
+			lowest_load = mc->mc_load;
+			mm->mm_preferred_cnt = 0;
+		}
+		mm->mm_preferred[mm->mm_preferred_cnt] = c;
+		mm->mm_preferred_cnt++;
 	}
 
+	if (mm->mm_preferred_cnt == 1) {
+		vdev_queue_register_lastoffset(
+		    mm->mm_child[mm->mm_preferred[0]].mc_vd, zio);
+		return (mm->mm_preferred[0]);
+	}
+
+	if (mm->mm_preferred_cnt > 1) {
+		int c = vdev_mirror_preferred_child_randomize(zio);
+
+		vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio);
+		return (c);
+	}
+
 	/*
 	 * Every device is either missing or has this txg in its DTL.
 	 * Look for any child we haven't already tried before giving up.
 	 */
-	for (c = 0; c < mm->mm_children; c++)
-		if (!mm->mm_child[c].mc_tried)
+	for (c = 0; c < mm->mm_children; c++) {
+		if (!mm->mm_child[c].mc_tried) {
+			vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd,
+			    zio);
 			return (c);
+		}
+	}
 
 	/*
 	 * Every child failed.  There's no place left to look.
@@ -261,7 +478,7 @@
 	return (-1);
 }
 
-static int
+static void
 vdev_mirror_io_start(zio_t *zio)
 {
 	mirror_map_t *mm;
@@ -268,10 +485,11 @@
 	mirror_child_t *mc;
 	int c, children;
 
-	mm = vdev_mirror_map_alloc(zio);
+	mm = vdev_mirror_map_init(zio);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
-		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
+		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering &&
+		    mm->mm_children > 1) {
 			/*
 			 * For scrubbing reads we need to allocate a read
 			 * buffer for each child and issue reads to all
@@ -286,7 +504,8 @@
 				    zio->io_type, zio->io_priority, 0,
 				    vdev_mirror_scrub_done, mc));
 			}
-			return (ZIO_PIPELINE_CONTINUE);
+			zio_execute(zio);
+			return;
 		}
 		/*
 		 * For normal reads just pick one child.
@@ -313,7 +532,7 @@
 		c++;
 	}
 
-	return (ZIO_PIPELINE_CONTINUE);
+	zio_execute(zio);
 }
 
 static int
@@ -409,7 +628,7 @@
 	if (good_copies && spa_writeable(zio->io_spa) &&
 	    (unexpected_errors ||
 	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
-	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
+	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
 		/*
 		 * Use the good data we have in hand to repair damaged children.
 		 */
@@ -436,7 +655,7 @@
 			zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 			    mc->mc_vd, mc->mc_offset,
 			    zio->io_data, zio->io_size,
-			    ZIO_TYPE_WRITE, zio->io_priority,
+			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -25,7 +25,7 @@
  */
 
 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 /*
@@ -46,7 +46,7 @@
 /* ARGSUSED */
 static int
 vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
-    uint64_t *ashift)
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	/*
 	 * Really this should just fail.  But then the root vdev will be in the
@@ -56,7 +56,8 @@
 	 */
 	*psize = 0;
 	*max_psize = 0;
-	*ashift = 0;
+	*logical_ashift = 0;
+	*physical_ashift = 0;
 	return (0);
 }
 
@@ -67,11 +68,11 @@
 }
 
 /* ARGSUSED */
-static int
+static void
 vdev_missing_io_start(zio_t *zio)
 {
 	zio->io_error = SET_ERROR(ENOTSUP);
-	return (ZIO_PIPELINE_CONTINUE);
+	zio_execute(zio);
 }
 
 /* ARGSUSED */

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -25,87 +25,295 @@
  */
 
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/vdev_impl.h>
+#include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/avl.h>
+#include <sys/dsl_pool.h>
+#include <sys/metaslab_impl.h>
 
 /*
- * These tunables are for performance analysis.
+ * ZFS I/O Scheduler
+ * -----------------
+ *
+ * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios.  The
+ * I/O scheduler determines when and in what order those operations are
+ * issued.  The I/O scheduler divides operations into six I/O classes
+ * prioritized in the following order: sync read, sync write, async read,
+ * async write, scrub/resilver and trim.  Each queue defines the minimum and
+ * maximum number of concurrent operations that may be issued to the device.
+ * In addition, the device has an aggregate maximum. Note that the sum of the
+ * per-queue minimums must not exceed the aggregate maximum, and if the
+ * aggregate maximum is equal to or greater than the sum of the per-queue
+ * maximums, the per-queue minimum has no effect.
+ *
+ * For many physical devices, throughput increases with the number of
+ * concurrent operations, but latency typically suffers. Further, physical
+ * devices typically have a limit at which more concurrent operations have no
+ * effect on throughput or can actually cause it to decrease.
+ *
+ * The scheduler selects the next operation to issue by first looking for an
+ * I/O class whose minimum has not been satisfied. Once all are satisfied and
+ * the aggregate maximum has not been hit, the scheduler looks for classes
+ * whose maximum has not been satisfied. Iteration through the I/O classes is
+ * done in the order specified above. No further operations are issued if the
+ * aggregate maximum number of concurrent operations has been hit or if there
+ * are no operations queued for an I/O class that has not hit its maximum.
+ * Every time an I/O is queued or an operation completes, the I/O scheduler
+ * looks for new operations to issue.
+ *
+ * All I/O classes have a fixed maximum number of outstanding operations
+ * except for the async write class. Asynchronous writes represent the data
+ * that is committed to stable storage during the syncing stage for
+ * transaction groups (see txg.c). Transaction groups enter the syncing state
+ * periodically so the number of queued async writes will quickly burst up and
+ * then bleed down to zero. Rather than servicing them as quickly as possible,
+ * the I/O scheduler changes the maximum number of active async write I/Os
+ * according to the amount of dirty data in the pool (see dsl_pool.c). Since
+ * both throughput and latency typically increase with the number of
+ * concurrent operations issued to physical devices, reducing the burstiness
+ * in the number of concurrent operations also stabilizes the response time of
+ * operations from other -- and in particular synchronous -- queues. In broad
+ * strokes, the I/O scheduler will issue more concurrent operations from the
+ * async write queue as there's more dirty data in the pool.
+ *
+ * Async Writes
+ *
+ * The number of concurrent operations issued for the async write I/O class
+ * follows a piece-wise linear function defined by a few adjustable points.
+ *
+ *        |                   o---------| <-- zfs_vdev_async_write_max_active
+ *   ^    |                  /^         |
+ *   |    |                 / |         |
+ * active |                /  |         |
+ *  I/O   |               /   |         |
+ * count  |              /    |         |
+ *        |             /     |         |
+ *        |------------o      |         | <-- zfs_vdev_async_write_min_active
+ *       0|____________^______|_________|
+ *        0%           |      |       100% of zfs_dirty_data_max
+ *                     |      |
+ *                     |      `-- zfs_vdev_async_write_active_max_dirty_percent
+ *                     `--------- zfs_vdev_async_write_active_min_dirty_percent
+ *
+ * Until the amount of dirty data exceeds a minimum percentage of the dirty
+ * data allowed in the pool, the I/O scheduler will limit the number of
+ * concurrent operations to the minimum. As that threshold is crossed, the
+ * number of concurrent operations issued increases linearly to the maximum at
+ * the specified maximum percentage of the dirty data allowed in the pool.
+ *
+ * Ideally, the amount of dirty data on a busy pool will stay in the sloped
+ * part of the function between zfs_vdev_async_write_active_min_dirty_percent
+ * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
+ * maximum percentage, this indicates that the rate of incoming data is
+ * greater than the rate that the backend storage can handle. In this case, we
+ * must further throttle incoming writes (see dmu_tx_delay() for details).
  */
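[Editor's note: as a concrete check of the sloped region above: with the defaults
introduced below (min_active = 1, max_active = 10, thresholds 30% and 60%), a pool
sitting at 45% of zfs_dirty_data_max gets (45 - 30) * (10 - 1) / (60 - 30) + 1 = 5
concurrent async writes (exact value 5.5, truncated by integer division). A minimal
stand-alone sketch of the same interpolation, using local stand-ins for the kernel
globals:]

    #include <stdint.h>
    #include <stdio.h>

    /* Local stand-ins for the kernel tunables (same defaults as the patch). */
    static const int min_active = 1, max_active = 10;
    static const int min_pct = 30, max_pct = 60;

    static int
    async_write_max(uint64_t dirty, uint64_t dirty_max)
    {
        uint64_t min_bytes = dirty_max * min_pct / 100;
        uint64_t max_bytes = dirty_max * max_pct / 100;

        if (dirty < min_bytes)
            return (min_active);
        if (dirty > max_bytes)
            return (max_active);
        return ((int)((dirty - min_bytes) * (max_active - min_active) /
            (max_bytes - min_bytes)) + min_active);
    }

    int
    main(void)
    {
        uint64_t dirty_max = 100;	/* treat units as percent for clarity */

        for (uint64_t d = 0; d <= 100; d += 15)
            printf("dirty %ju%% -> %d active async writes\n",
                (uintmax_t)d, async_write_max(d, dirty_max));
        return (0);
    }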
 
-/* The maximum number of I/Os concurrently pending to each device. */
-int zfs_vdev_max_pending = 10;
-
 /*
- * The initial number of I/Os pending to each device, before it starts ramping
- * up to zfs_vdev_max_pending.
+ * The maximum number of I/Os active to each device.  Ideally, this will be >=
+ * the sum of each queue's max_active.  It must be at least the sum of each
+ * queue's min_active.
  */
-int zfs_vdev_min_pending = 4;
+uint32_t zfs_vdev_max_active = 1000;
 
 /*
- * The deadlines are grouped into buckets based on zfs_vdev_time_shift:
- * deadline = pri + gethrtime() >> time_shift)
+ * Per-queue limits on the number of I/Os active to each device.  If the
+ * sum of the queues' max_active is < zfs_vdev_max_active, then the
+ * min_active comes into play.  We will send min_active from each queue,
+ * and then select from queues in the order defined by zio_priority_t.
+ *
+ * In general, smaller max_active's will lead to lower latency of synchronous
+ * operations.  Larger max_active's may lead to higher overall throughput,
+ * depending on underlying storage.
+ *
+ * The ratio of the queues' max_actives determines the balance of performance
+ * between reads, writes, and scrubs.  E.g., increasing
+ * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
+ * more quickly, but reads and writes to have higher latency and lower
+ * throughput.
  */
-int zfs_vdev_time_shift = 29; /* each bucket is 0.537 seconds */
+uint32_t zfs_vdev_sync_read_min_active = 10;
+uint32_t zfs_vdev_sync_read_max_active = 10;
+uint32_t zfs_vdev_sync_write_min_active = 10;
+uint32_t zfs_vdev_sync_write_max_active = 10;
+uint32_t zfs_vdev_async_read_min_active = 1;
+uint32_t zfs_vdev_async_read_max_active = 3;
+uint32_t zfs_vdev_async_write_min_active = 1;
+uint32_t zfs_vdev_async_write_max_active = 10;
+uint32_t zfs_vdev_scrub_min_active = 1;
+uint32_t zfs_vdev_scrub_max_active = 2;
+uint32_t zfs_vdev_trim_min_active = 1;
+/*
+ * TRIM max active is large in comparison to the other values because TRIM
+ * I/Os are coalesced at the device layer. This value is set such that a
+ * typical SSD can process the queued I/Os in a single request.
+ */
+uint32_t zfs_vdev_trim_max_active = 64;
 
-/* exponential I/O issue ramp-up rate */
-int zfs_vdev_ramp_rate = 2;
 
 /*
+ * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
+ * dirty data, use zfs_vdev_async_write_min_active.  When it has more than
+ * zfs_vdev_async_write_active_max_dirty_percent, use
+ * zfs_vdev_async_write_max_active. The value is linearly interpolated
+ * between min and max.
+ */
+int zfs_vdev_async_write_active_min_dirty_percent = 30;
+int zfs_vdev_async_write_active_max_dirty_percent = 60;
+
+/*
  * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
  * For read I/Os, we also aggregate across small adjacency gaps; for writes
  * we include spans of optional I/Os to aid aggregation at the disk even when
  * they aren't able to help us aggregate at this level.
  */
-int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
 int zfs_vdev_read_gap_limit = 32 << 10;
 int zfs_vdev_write_gap_limit = 4 << 10;
 
+/*
+ * Define the queue depth percentage for each top-level vdev. This
+ * percentage is used in conjunction with zfs_vdev_async_write_max_active
+ * to determine how many allocations a specific top-level vdev should
+ * handle. Once the queue depth reaches
+ * zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100, the
+ * allocator will stop allocating blocks on that top-level device. The
+ * default kernel setting is 1000%, which yields 100 allocations per
+ * device. For userland testing, the default setting is 300%, which
+ * equates to 30 allocations per device.
+ */
+#ifdef _KERNEL
+int zfs_vdev_queue_depth_pct = 1000;
+#else
+int zfs_vdev_queue_depth_pct = 300;
+#endif
+
+
+#ifdef __FreeBSD__
+#ifdef _KERNEL
 SYSCTL_DECL(_vfs_zfs_vdev);
-TUNABLE_INT("vfs.zfs.vdev.max_pending", &zfs_vdev_max_pending);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RW,
-    &zfs_vdev_max_pending, 0, "Maximum I/O requests pending on each device");
-TUNABLE_INT("vfs.zfs.vdev.min_pending", &zfs_vdev_min_pending);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RW,
-    &zfs_vdev_min_pending, 0,
-    "Initial number of I/O requests pending to each device");
-TUNABLE_INT("vfs.zfs.vdev.time_shift", &zfs_vdev_time_shift);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, time_shift, CTLFLAG_RW,
-    &zfs_vdev_time_shift, 0, "Used for calculating I/O request deadline");
-TUNABLE_INT("vfs.zfs.vdev.ramp_rate", &zfs_vdev_ramp_rate);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, ramp_rate, CTLFLAG_RW,
-    &zfs_vdev_ramp_rate, 0, "Exponential I/O issue ramp-up rate");
+
+TUNABLE_INT("vfs.zfs.vdev.async_write_active_min_dirty_percent",
+    &zfs_vdev_async_write_active_min_dirty_percent);
+static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+    sysctl_zfs_async_write_active_min_dirty_percent, "I",
+    "Percentage of async write dirty data below which "
+    "async_write_min_active is used.");
+
+TUNABLE_INT("vfs.zfs.vdev.async_write_active_max_dirty_percent",
+    &zfs_vdev_async_write_active_max_dirty_percent);
+static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+    sysctl_zfs_async_write_active_max_dirty_percent, "I",
+    "Percentage of async write dirty data above which "
+    "async_write_max_active is used.");
+
+TUNABLE_INT("vfs.zfs.vdev.max_active", &zfs_vdev_max_active);
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN,
+    &zfs_vdev_max_active, 0,
+    "The maximum number of I/Os of all types active for each device.");
+
+#define ZFS_VDEV_QUEUE_KNOB_MIN(name)					\
+TUNABLE_INT("vfs.zfs.vdev." #name "_min_active",			\
+    &zfs_vdev_ ## name ## _min_active);					\
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active,		\
+    CTLFLAG_RWTUN, &zfs_vdev_ ## name ## _min_active, 0,		\
+    "Initial number of I/O requests of type " #name			\
+    " active for each device");
+
+#define ZFS_VDEV_QUEUE_KNOB_MAX(name)					\
+TUNABLE_INT("vfs.zfs.vdev." #name "_max_active",			\
+    &zfs_vdev_ ## name ## _max_active);					\
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active,		\
+    CTLFLAG_RWTUN, &zfs_vdev_ ## name ## _max_active, 0,		\
+    "Maximum number of I/O requests of type " #name			\
+    " active for each device");
+
+ZFS_VDEV_QUEUE_KNOB_MIN(sync_read);
+ZFS_VDEV_QUEUE_KNOB_MAX(sync_read);
+ZFS_VDEV_QUEUE_KNOB_MIN(sync_write);
+ZFS_VDEV_QUEUE_KNOB_MAX(sync_write);
+ZFS_VDEV_QUEUE_KNOB_MIN(async_read);
+ZFS_VDEV_QUEUE_KNOB_MAX(async_read);
+ZFS_VDEV_QUEUE_KNOB_MIN(async_write);
+ZFS_VDEV_QUEUE_KNOB_MAX(async_write);
+ZFS_VDEV_QUEUE_KNOB_MIN(scrub);
+ZFS_VDEV_QUEUE_KNOB_MAX(scrub);
+ZFS_VDEV_QUEUE_KNOB_MIN(trim);
+ZFS_VDEV_QUEUE_KNOB_MAX(trim);
+
+#undef ZFS_VDEV_QUEUE_KNOB
+
 TUNABLE_INT("vfs.zfs.vdev.aggregation_limit", &zfs_vdev_aggregation_limit);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RW,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN,
     &zfs_vdev_aggregation_limit, 0,
     "I/O requests are aggregated up to this size");
 TUNABLE_INT("vfs.zfs.vdev.read_gap_limit", &zfs_vdev_read_gap_limit);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RW,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN,
     &zfs_vdev_read_gap_limit, 0,
     "Acceptable gap between two reads being aggregated");
 TUNABLE_INT("vfs.zfs.vdev.write_gap_limit", &zfs_vdev_write_gap_limit);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RW,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN,
     &zfs_vdev_write_gap_limit, 0,
     "Acceptable gap between two writes being aggregated");
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN,
+    &zfs_vdev_queue_depth_pct, 0,
+    "Queue depth percentage for each top-level");
 
-/*
- * Virtual device vector for disk I/O scheduling.
- */
+static int
+sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS)
+{
+	int val, err;
+
+	val = zfs_vdev_async_write_active_min_dirty_percent;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < 0 || val > 100 ||
+	    val >= zfs_vdev_async_write_active_max_dirty_percent)
+		return (EINVAL);
+
+	zfs_vdev_async_write_active_min_dirty_percent = val;
+
+	return (0);
+}
+
+static int
+sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS)
+{
+	int val, err;
+
+	val = zfs_vdev_async_write_active_max_dirty_percent;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < 0 || val > 100 ||
+	    val <= zfs_vdev_async_write_active_min_dirty_percent)
+		return (EINVAL);
+
+	zfs_vdev_async_write_active_max_dirty_percent = val;
+
+	return (0);
+}
+#endif
+#endif
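[Editor's note: the new knobs are ordinary sysctls and can be read or tuned at run
time. A hedged user-land example using sysctlbyname(3); only the OID name is taken
from the patch, the rest is illustrative:]

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
        unsigned int val;
        size_t len = sizeof(val);

        /* Read the aggregate per-device I/O limit added by this patch. */
        if (sysctlbyname("vfs.zfs.vdev.max_active", &val, &len,
            NULL, 0) == -1) {
            perror("sysctlbyname");
            return (1);
        }
        printf("vfs.zfs.vdev.max_active = %u\n", val);
        return (0);
    }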
+
 int
-vdev_queue_deadline_compare(const void *x1, const void *x2)
+vdev_queue_offset_compare(const void *x1, const void *x2)
 {
 	const zio_t *z1 = x1;
 	const zio_t *z2 = x2;
 
-	if (z1->io_deadline < z2->io_deadline)
-		return (-1);
-	if (z1->io_deadline > z2->io_deadline)
-		return (1);
-
 	if (z1->io_offset < z2->io_offset)
 		return (-1);
 	if (z1->io_offset > z2->io_offset)
@@ -119,12 +327,34 @@
 	return (0);
 }
 
+static inline avl_tree_t *
+vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
+{
+	return (&vq->vq_class[p].vqc_queued_tree);
+}
+
+static inline avl_tree_t *
+vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
+{
+	if (t == ZIO_TYPE_READ)
+		return (&vq->vq_read_offset_tree);
+	else if (t == ZIO_TYPE_WRITE)
+		return (&vq->vq_write_offset_tree);
+	else
+		return (NULL);
+}
+
 int
-vdev_queue_offset_compare(const void *x1, const void *x2)
+vdev_queue_timestamp_compare(const void *x1, const void *x2)
 {
 	const zio_t *z1 = x1;
 	const zio_t *z2 = x2;
 
+	if (z1->io_timestamp < z2->io_timestamp)
+		return (-1);
+	if (z1->io_timestamp > z2->io_timestamp)
+		return (1);
+
 	if (z1->io_offset < z2->io_offset)
 		return (-1);
 	if (z1->io_offset > z2->io_offset)
@@ -144,18 +374,35 @@
 	vdev_queue_t *vq = &vd->vdev_queue;
 
 	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+	vq->vq_vdev = vd;
 
-	avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
-	    sizeof (zio_t), offsetof(struct zio, io_deadline_node));
+	avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
+	    sizeof (zio_t), offsetof(struct zio, io_queue_node));
+	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
+	    vdev_queue_offset_compare, sizeof (zio_t),
+	    offsetof(struct zio, io_offset_node));
+	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
+	    vdev_queue_offset_compare, sizeof (zio_t),
+	    offsetof(struct zio, io_offset_node));
 
-	avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
-	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
+	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+		int (*compfn) (const void *, const void *);
 
-	avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
-	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
+		/*
+		 * The synchronous i/o queues are dispatched in FIFO rather
+		 * than LBA order.  This provides more consistent latency for
+		 * these i/os.
+		 */
+		if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
+			compfn = vdev_queue_timestamp_compare;
+		else
+			compfn = vdev_queue_offset_compare;
 
-	avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
-	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
+		avl_create(vdev_queue_class_tree(vq, p), compfn,
+		    sizeof (zio_t), offsetof(struct zio, io_queue_node));
+	}
+
+	vq->vq_lastoffset = 0;
 }
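[Editor's note: a toy illustration of the two orderings set up above (FIFO for the
sync queues, LBA for the rest), using stand-in records rather than real zios:]

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct { uint64_t ts, off; } rec_t;

    /* FIFO: order by timestamp, then offset (sync queues). */
    static int
    cmp_fifo(const void *a, const void *b)
    {
        const rec_t *x = a, *y = b;

        if (x->ts != y->ts)
            return (x->ts < y->ts ? -1 : 1);
        return (x->off < y->off ? -1 : x->off > y->off ? 1 : 0);
    }

    /* LBA: order by offset only (async/scrub queues). */
    static int
    cmp_lba(const void *a, const void *b)
    {
        const rec_t *x = a, *y = b;

        return (x->off < y->off ? -1 : x->off > y->off ? 1 : 0);
    }

    int
    main(void)
    {
        rec_t r[3] = { { 3, 100 }, { 1, 900 }, { 2, 500 } };

        qsort(r, 3, sizeof (rec_t), cmp_fifo);
        printf("FIFO first: ts=%ju\n", (uintmax_t)r[0].ts);	/* ts=1 */
        qsort(r, 3, sizeof (rec_t), cmp_lba);
        printf("LBA first: off=%ju\n", (uintmax_t)r[0].off);	/* off=100 */
        return (0);
    }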
 
 void
@@ -163,10 +410,11 @@
 {
 	vdev_queue_t *vq = &vd->vdev_queue;
 
-	avl_destroy(&vq->vq_deadline_tree);
-	avl_destroy(&vq->vq_read_tree);
-	avl_destroy(&vq->vq_write_tree);
-	avl_destroy(&vq->vq_pending_tree);
+	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
+		avl_destroy(vdev_queue_class_tree(vq, p));
+	avl_destroy(&vq->vq_active_tree);
+	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
+	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
 
 	mutex_destroy(&vq->vq_lock);
 }
@@ -174,31 +422,232 @@
 static void
 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 {
-	avl_add(&vq->vq_deadline_tree, zio);
-	avl_add(zio->io_vdev_tree, zio);
+	spa_t *spa = zio->io_spa;
+	avl_tree_t *qtt;
+
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+	avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+	qtt = vdev_queue_type_tree(vq, zio->io_type);
+	if (qtt)
+		avl_add(qtt, zio);
+
+#ifdef illumos
+	mutex_enter(&spa->spa_iokstat_lock);
+	spa->spa_queue_stats[zio->io_priority].spa_queued++;
+	if (spa->spa_iokstat != NULL)
+		kstat_waitq_enter(spa->spa_iokstat->ks_data);
+	mutex_exit(&spa->spa_iokstat_lock);
+#endif
 }
 
 static void
 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
-	avl_remove(&vq->vq_deadline_tree, zio);
-	avl_remove(zio->io_vdev_tree, zio);
+	spa_t *spa = zio->io_spa;
+	avl_tree_t *qtt;
+
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+	avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+	qtt = vdev_queue_type_tree(vq, zio->io_type);
+	if (qtt)
+		avl_remove(qtt, zio);
+
+#ifdef illumos
+	mutex_enter(&spa->spa_iokstat_lock);
+	ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
+	spa->spa_queue_stats[zio->io_priority].spa_queued--;
+	if (spa->spa_iokstat != NULL)
+		kstat_waitq_exit(spa->spa_iokstat->ks_data);
+	mutex_exit(&spa->spa_iokstat_lock);
+#endif
 }
 
 static void
+vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+	vq->vq_class[zio->io_priority].vqc_active++;
+	avl_add(&vq->vq_active_tree, zio);
+
+#ifdef illumos
+	mutex_enter(&spa->spa_iokstat_lock);
+	spa->spa_queue_stats[zio->io_priority].spa_active++;
+	if (spa->spa_iokstat != NULL)
+		kstat_runq_enter(spa->spa_iokstat->ks_data);
+	mutex_exit(&spa->spa_iokstat_lock);
+#endif
+}
+
+static void
+vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+	vq->vq_class[zio->io_priority].vqc_active--;
+	avl_remove(&vq->vq_active_tree, zio);
+
+#ifdef illumos
+	mutex_enter(&spa->spa_iokstat_lock);
+	ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0);
+	spa->spa_queue_stats[zio->io_priority].spa_active--;
+	if (spa->spa_iokstat != NULL) {
+		kstat_io_t *ksio = spa->spa_iokstat->ks_data;
+
+		kstat_runq_exit(spa->spa_iokstat->ks_data);
+		if (zio->io_type == ZIO_TYPE_READ) {
+			ksio->reads++;
+			ksio->nread += zio->io_size;
+		} else if (zio->io_type == ZIO_TYPE_WRITE) {
+			ksio->writes++;
+			ksio->nwritten += zio->io_size;
+		}
+	}
+	mutex_exit(&spa->spa_iokstat_lock);
+#endif
+}
+
+static void
 vdev_queue_agg_io_done(zio_t *aio)
 {
-	zio_t *pio;
-
-	while ((pio = zio_walk_parents(aio)) != NULL)
-		if (aio->io_type == ZIO_TYPE_READ)
+	if (aio->io_type == ZIO_TYPE_READ) {
+		zio_t *pio;
+		zio_link_t *zl = NULL;
+		while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
 			bcopy((char *)aio->io_data + (pio->io_offset -
 			    aio->io_offset), pio->io_data, pio->io_size);
+		}
+	}
 
 	zio_buf_free(aio->io_data, aio->io_size);
 }
 
+static int
+vdev_queue_class_min_active(zio_priority_t p)
+{
+	switch (p) {
+	case ZIO_PRIORITY_SYNC_READ:
+		return (zfs_vdev_sync_read_min_active);
+	case ZIO_PRIORITY_SYNC_WRITE:
+		return (zfs_vdev_sync_write_min_active);
+	case ZIO_PRIORITY_ASYNC_READ:
+		return (zfs_vdev_async_read_min_active);
+	case ZIO_PRIORITY_ASYNC_WRITE:
+		return (zfs_vdev_async_write_min_active);
+	case ZIO_PRIORITY_SCRUB:
+		return (zfs_vdev_scrub_min_active);
+	case ZIO_PRIORITY_TRIM:
+		return (zfs_vdev_trim_min_active);
+	default:
+		panic("invalid priority %u", p);
+		return (0);
+	}
+}
+
+static __noinline int
+vdev_queue_max_async_writes(spa_t *spa)
+{
+	int writes;
+	uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total;
+	uint64_t min_bytes = zfs_dirty_data_max *
+	    zfs_vdev_async_write_active_min_dirty_percent / 100;
+	uint64_t max_bytes = zfs_dirty_data_max *
+	    zfs_vdev_async_write_active_max_dirty_percent / 100;
+
+	/*
+	 * Sync tasks correspond to interactive user actions. To reduce the
+	 * execution time of those actions we push data out as fast as possible.
+	 */
+	if (spa_has_pending_synctask(spa)) {
+		return (zfs_vdev_async_write_max_active);
+	}
+
+	if (dirty < min_bytes)
+		return (zfs_vdev_async_write_min_active);
+	if (dirty > max_bytes)
+		return (zfs_vdev_async_write_max_active);
+
+	/*
+	 * linear interpolation:
+	 * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
+	 * move right by min_bytes
+	 * move up by min_writes
+	 */
+	writes = (dirty - min_bytes) *
+	    (zfs_vdev_async_write_max_active -
+	    zfs_vdev_async_write_min_active) /
+	    (max_bytes - min_bytes) +
+	    zfs_vdev_async_write_min_active;
+	ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
+	ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
+	return (writes);
+}
+
+static int
+vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
+{
+	switch (p) {
+	case ZIO_PRIORITY_SYNC_READ:
+		return (zfs_vdev_sync_read_max_active);
+	case ZIO_PRIORITY_SYNC_WRITE:
+		return (zfs_vdev_sync_write_max_active);
+	case ZIO_PRIORITY_ASYNC_READ:
+		return (zfs_vdev_async_read_max_active);
+	case ZIO_PRIORITY_ASYNC_WRITE:
+		return (vdev_queue_max_async_writes(spa));
+	case ZIO_PRIORITY_SCRUB:
+		return (zfs_vdev_scrub_max_active);
+	case ZIO_PRIORITY_TRIM:
+		return (zfs_vdev_trim_max_active);
+	default:
+		panic("invalid priority %u", p);
+		return (0);
+	}
+}
+
 /*
+ * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if
+ * there is no eligible class.
+ */
+static zio_priority_t
+vdev_queue_class_to_issue(vdev_queue_t *vq)
+{
+	spa_t *spa = vq->vq_vdev->vdev_spa;
+	zio_priority_t p;
+
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+	if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
+		return (ZIO_PRIORITY_NUM_QUEUEABLE);
+
+	/* find a queue that has not reached its minimum # outstanding i/os */
+	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
+		    vq->vq_class[p].vqc_active <
+		    vdev_queue_class_min_active(p))
+			return (p);
+	}
+
+	/*
+	 * If we haven't found a queue, look for one that hasn't reached its
+	 * maximum # outstanding i/os.
+	 */
+	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
+		    vq->vq_class[p].vqc_active <
+		    vdev_queue_class_max_active(spa, p))
+			return (p);
+	}
+
+	/* No eligible queued i/os */
+	return (ZIO_PRIORITY_NUM_QUEUEABLE);
+}
+
+/*
  * Compute the range spanned by two i/os, which is the endpoint of the last
  * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
  * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
@@ -208,155 +657,193 @@
 #define	IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
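[Editor's note: a quick worked example of these macros with a minimal zio stand-in:
an I/O at offset 0 of size 4 KB followed by one at offset 8 KB of size 4 KB spans
12 KB with a 4 KB gap; adjacent I/Os have a gap of 0.]

    #include <stdint.h>
    #include <stdio.h>

    /* Minimal stand-in for the fields the macros use. */
    typedef struct { uint64_t io_offset, io_size; } miniio_t;

    #define	IO_SPAN(fio, lio) \
        ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
    #define	IO_GAP(fio, lio) (-IO_SPAN(lio, fio))

    int
    main(void)
    {
        miniio_t a = { 0, 4096 }, b = { 8192, 4096 };

        printf("span %jd gap %jd\n", (intmax_t)IO_SPAN(&a, &b),
            (intmax_t)IO_GAP(&a, &b));	/* span 12288, gap 4096 */
        return (0);
    }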
 
 static zio_t *
-vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
+vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 {
-	zio_t *fio, *lio, *aio, *dio, *nio, *mio;
+	zio_t *first, *last, *aio, *dio, *mandatory, *nio;
+	void *abuf;
+	uint64_t maxgap = 0;
+	uint64_t size;
+	boolean_t stretch;
 	avl_tree_t *t;
-	int flags;
-	uint64_t maxspan = zfs_vdev_aggregation_limit;
-	uint64_t maxgap;
-	int stretch;
+	enum zio_flag flags;
 
-again:
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 
-	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
-	    avl_numnodes(&vq->vq_deadline_tree) == 0)
+	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
 		return (NULL);
 
-	fio = lio = avl_first(&vq->vq_deadline_tree);
+	first = last = zio;
 
-	t = fio->io_vdev_tree;
-	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
-	maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;
+	if (zio->io_type == ZIO_TYPE_READ)
+		maxgap = zfs_vdev_read_gap_limit;
 
-	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
-		/*
-		 * We can aggregate I/Os that are sufficiently adjacent and of
-		 * the same flavor, as expressed by the AGG_INHERIT flags.
-		 * The latter requirement is necessary so that certain
-		 * attributes of the I/O, such as whether it's a normal I/O
-		 * or a scrub/resilver, can be preserved in the aggregate.
-		 * We can include optional I/Os, but don't allow them
-		 * to begin a range as they add no benefit in that situation.
-		 */
+	/*
+	 * We can aggregate I/Os that are sufficiently adjacent and of
+	 * the same flavor, as expressed by the AGG_INHERIT flags.
+	 * The latter requirement is necessary so that certain
+	 * attributes of the I/O, such as whether it's a normal I/O
+	 * or a scrub/resilver, can be preserved in the aggregate.
+	 * We can include optional I/Os, but don't allow them
+	 * to begin a range as they add no benefit in that situation.
+	 */
 
-		/*
-		 * We keep track of the last non-optional I/O.
-		 */
-		mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;
+	/*
+	 * We keep track of the last non-optional I/O.
+	 */
+	mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;
 
-		/*
-		 * Walk backwards through sufficiently contiguous I/Os
-		 * recording the last non-option I/O.
-		 */
-		while ((dio = AVL_PREV(t, fio)) != NULL &&
-		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
-		    IO_SPAN(dio, lio) <= maxspan &&
-		    IO_GAP(dio, fio) <= maxgap) {
-			fio = dio;
-			if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
-				mio = fio;
-		}
+	/*
+	 * Walk backwards through sufficiently contiguous I/Os
+	 * recording the last non-optional I/O.
+	 */
+	flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
+	t = vdev_queue_type_tree(vq, zio->io_type);
+	while (t != NULL && (dio = AVL_PREV(t, first)) != NULL &&
+	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+	    IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
+	    IO_GAP(dio, first) <= maxgap) {
+		first = dio;
+		if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
+			mandatory = first;
+	}
 
-		/*
-		 * Skip any initial optional I/Os.
-		 */
-		while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
-			fio = AVL_NEXT(t, fio);
-			ASSERT(fio != NULL);
-		}
+	/*
+	 * Skip any initial optional I/Os.
+	 */
+	while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
+		first = AVL_NEXT(t, first);
+		ASSERT(first != NULL);
+	}
 
-		/*
-		 * Walk forward through sufficiently contiguous I/Os.
-		 */
-		while ((dio = AVL_NEXT(t, lio)) != NULL &&
-		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
-		    IO_SPAN(fio, dio) <= maxspan &&
-		    IO_GAP(lio, dio) <= maxgap) {
-			lio = dio;
-			if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
-				mio = lio;
-		}
+	/*
+	 * Walk forward through sufficiently contiguous I/Os.
+	 */
+	while ((dio = AVL_NEXT(t, last)) != NULL &&
+	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+	    IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit &&
+	    IO_GAP(last, dio) <= maxgap) {
+		last = dio;
+		if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
+			mandatory = last;
+	}
 
-		/*
-		 * Now that we've established the range of the I/O aggregation
-		 * we must decide what to do with trailing optional I/Os.
-		 * For reads, there's nothing to do. While we are unable to
-		 * aggregate further, it's possible that a trailing optional
-		 * I/O would allow the underlying device to aggregate with
-		 * subsequent I/Os. We must therefore determine if the next
-		 * non-optional I/O is close enough to make aggregation
-		 * worthwhile.
-		 */
-		stretch = B_FALSE;
-		if (t != &vq->vq_read_tree && mio != NULL) {
-			nio = lio;
-			while ((dio = AVL_NEXT(t, nio)) != NULL &&
-			    IO_GAP(nio, dio) == 0 &&
-			    IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
-				nio = dio;
-				if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
-					stretch = B_TRUE;
-					break;
-				}
+	/*
+	 * Now that we've established the range of the I/O aggregation
+	 * we must decide what to do with trailing optional I/Os.
+	 * For reads, there's nothing to do. While we are unable to
+	 * aggregate further, it's possible that a trailing optional
+	 * I/O would allow the underlying device to aggregate with
+	 * subsequent I/Os. We must therefore determine if the next
+	 * non-optional I/O is close enough to make aggregation
+	 * worthwhile.
+	 */
+	stretch = B_FALSE;
+	if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
+		zio_t *nio = last;
+		while ((dio = AVL_NEXT(t, nio)) != NULL &&
+		    IO_GAP(nio, dio) == 0 &&
+		    IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
+			nio = dio;
+			if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
+				stretch = B_TRUE;
+				break;
 			}
 		}
+	}
 
-		if (stretch) {
-			/* This may be a no-op. */
-			VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
-			dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
-		} else {
-			while (lio != mio && lio != fio) {
-				ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
-				lio = AVL_PREV(t, lio);
-				ASSERT(lio != NULL);
-			}
+	if (stretch) {
+		/* This may be a no-op. */
+		dio = AVL_NEXT(t, last);
+		dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
+	} else {
+		while (last != mandatory && last != first) {
+			ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
+			last = AVL_PREV(t, last);
+			ASSERT(last != NULL);
 		}
 	}
 
-	if (fio != lio) {
-		uint64_t size = IO_SPAN(fio, lio);
-		ASSERT(size <= zfs_vdev_aggregation_limit);
+	if (first == last)
+		return (NULL);
 
-		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
-		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
-		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
-		    vdev_queue_agg_io_done, NULL);
-		aio->io_timestamp = fio->io_timestamp;
+	size = IO_SPAN(first, last);
+	ASSERT3U(size, <=, zfs_vdev_aggregation_limit);
 
-		nio = fio;
-		do {
-			dio = nio;
-			nio = AVL_NEXT(t, dio);
-			ASSERT(dio->io_type == aio->io_type);
-			ASSERT(dio->io_vdev_tree == t);
+	abuf = zio_buf_alloc_nowait(size);
+	if (abuf == NULL)
+		return (NULL);
 
-			if (dio->io_flags & ZIO_FLAG_NODATA) {
-				ASSERT(dio->io_type == ZIO_TYPE_WRITE);
-				bzero((char *)aio->io_data + (dio->io_offset -
-				    aio->io_offset), dio->io_size);
-			} else if (dio->io_type == ZIO_TYPE_WRITE) {
-				bcopy(dio->io_data, (char *)aio->io_data +
-				    (dio->io_offset - aio->io_offset),
-				    dio->io_size);
-			}
+	aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
+	    abuf, size, first->io_type, zio->io_priority,
+	    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
+	    vdev_queue_agg_io_done, NULL);
+	aio->io_timestamp = first->io_timestamp;
 
-			zio_add_child(dio, aio);
-			vdev_queue_io_remove(vq, dio);
-			zio_vdev_io_bypass(dio);
-			zio_execute(dio);
-		} while (dio != lio);
+	nio = first;
+	do {
+		dio = nio;
+		nio = AVL_NEXT(t, dio);
+		ASSERT3U(dio->io_type, ==, aio->io_type);
 
-		avl_add(&vq->vq_pending_tree, aio);
+		if (dio->io_flags & ZIO_FLAG_NODATA) {
+			ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
+			bzero((char *)aio->io_data + (dio->io_offset -
+			    aio->io_offset), dio->io_size);
+		} else if (dio->io_type == ZIO_TYPE_WRITE) {
+			bcopy(dio->io_data, (char *)aio->io_data +
+			    (dio->io_offset - aio->io_offset),
+			    dio->io_size);
+		}
 
-		return (aio);
+		zio_add_child(dio, aio);
+		vdev_queue_io_remove(vq, dio);
+		zio_vdev_io_bypass(dio);
+		zio_execute(dio);
+	} while (dio != last);
+
+	return (aio);
+}
+
+static zio_t *
+vdev_queue_io_to_issue(vdev_queue_t *vq)
+{
+	zio_t *zio, *aio;
+	zio_priority_t p;
+	avl_index_t idx;
+	avl_tree_t *tree;
+	zio_t search;
+
+again:
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+	p = vdev_queue_class_to_issue(vq);
+
+	if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
+		/* No eligible queued i/os */
+		return (NULL);
 	}
 
-	ASSERT(fio->io_vdev_tree == t);
-	vdev_queue_io_remove(vq, fio);
+	/*
+	 * For LBA-ordered queues (async / scrub), issue the i/o which follows
+	 * the most recently issued i/o in LBA (offset) order.
+	 *
+	 * For FIFO queues (sync), issue the i/o with the lowest timestamp.
+	 */
+	tree = vdev_queue_class_tree(vq, p);
+	search.io_timestamp = 0;
+	search.io_offset = vq->vq_last_offset + 1;
+	VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
+	zio = avl_nearest(tree, idx, AVL_AFTER);
+	if (zio == NULL)
+		zio = avl_first(tree);
+	ASSERT3U(zio->io_priority, ==, p);
 
+	aio = vdev_queue_aggregate(vq, zio);
+	if (aio != NULL)
+		zio = aio;
+	else
+		vdev_queue_io_remove(vq, zio);
+
 	/*
 	 * If the I/O is or was optional and therefore has no data, we need to
 	 * simply discard it. We need to drop the vdev queue's lock to avoid a
@@ -363,17 +850,18 @@
 	 * deadlock that we could encounter since this I/O will complete
 	 * immediately.
 	 */
-	if (fio->io_flags & ZIO_FLAG_NODATA) {
+	if (zio->io_flags & ZIO_FLAG_NODATA) {
 		mutex_exit(&vq->vq_lock);
-		zio_vdev_io_bypass(fio);
-		zio_execute(fio);
+		zio_vdev_io_bypass(zio);
+		zio_execute(zio);
 		mutex_enter(&vq->vq_lock);
 		goto again;
 	}
 
-	avl_add(&vq->vq_pending_tree, fio);
+	vdev_queue_pending_add(vq, zio);
+	vq->vq_last_offset = zio->io_offset;
 
-	return (fio);
+	return (zio);
 }
 
 zio_t *
@@ -382,28 +870,33 @@
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 	zio_t *nio;
 
-	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
-
 	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
 		return (zio);
 
+	/*
+	 * Children i/os inherit their parent's priority, which might
+	 * not match the child's i/o type.  Fix it up here.
+	 */
+	if (zio->io_type == ZIO_TYPE_READ) {
+		if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
+		    zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
+		    zio->io_priority != ZIO_PRIORITY_SCRUB)
+			zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
+	} else if (zio->io_type == ZIO_TYPE_WRITE) {
+		if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
+		    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
+			zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
+	} else {
+		ASSERT(zio->io_type == ZIO_TYPE_FREE);
+		zio->io_priority = ZIO_PRIORITY_TRIM;
+	}
+
 	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
 
-	if (zio->io_type == ZIO_TYPE_READ)
-		zio->io_vdev_tree = &vq->vq_read_tree;
-	else
-		zio->io_vdev_tree = &vq->vq_write_tree;
-
 	mutex_enter(&vq->vq_lock);
-
 	zio->io_timestamp = gethrtime();
-	zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
-	    zio->io_priority;
-
 	vdev_queue_io_add(vq, zio);
-
-	nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);
-
+	nio = vdev_queue_io_to_issue(vq);
 	mutex_exit(&vq->vq_lock);
 
 	if (nio == NULL)
@@ -421,20 +914,15 @@
 vdev_queue_io_done(zio_t *zio)
 {
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+	zio_t *nio;
 
-	if (zio_injection_enabled)
-		delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
-
 	mutex_enter(&vq->vq_lock);
 
-	avl_remove(&vq->vq_pending_tree, zio);
+	vdev_queue_pending_remove(vq, zio);
 
 	vq->vq_io_complete_ts = gethrtime();
 
-	for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
-		zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
-		if (nio == NULL)
-			break;
+	while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
 		mutex_exit(&vq->vq_lock);
 		if (nio->io_done == vdev_queue_agg_io_done) {
 			zio_nowait(nio);
@@ -447,3 +935,26 @@
 
 	mutex_exit(&vq->vq_lock);
 }
+
+/*
+ * These three functions are used only for load calculations, so we are not
+ * concerned about an occasional stale or torn value on 32-bit platforms
+ * caused by the lack of vq_lock protection here; we prefer to keep them
+ * lock-free for performance.
+ */
+int
+vdev_queue_length(vdev_t *vd)
+{
+	return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+}
+
+uint64_t
+vdev_queue_lastoffset(vdev_t *vd)
+{
+	return (vd->vdev_queue.vq_lastoffset);
+}
+
+void
+vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio)
+{
+	vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size;
+}
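[Editor's note: these accessors feed the mirror code's load calculation.
vdev_mirror_load() itself is not part of this hunk, so the following is only an
assumed sketch of how queue length and last offset might combine into a load
metric:]

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-ins for the per-vdev state the new accessors expose. */
    typedef struct {
        int	queue_length;	/* vdev_queue_length() */
        uint64_t lastoffset;	/* vdev_queue_lastoffset() */
    } minivd_t;

    /* Assumed metric: queue depth plus a penalty for non-sequential I/O. */
    static int
    load_sketch(const minivd_t *vd, uint64_t offset)
    {
        int load = vd->queue_length;

        if (vd->lastoffset != offset)
            load += 10;	/* hypothetical seek penalty */
        return (load);
    }

    int
    main(void)
    {
        minivd_t busy = { 8, 4096 }, idle = { 1, 0 };

        printf("busy+seq %d, idle+seek %d\n",
            load_sketch(&busy, 4096), load_sketch(&idle, 4096));
        return (0);
    }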

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,16 +22,24 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/vdev_impl.h>
+#ifdef illumos
+#include <sys/vdev_disk.h>
+#endif
+#include <sys/vdev_file.h>
+#include <sys/vdev_raidz.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
+#include <sys/bio.h>
 
 /*
  * Virtual device vector for RAID-Z.
@@ -155,6 +163,8 @@
 	VDEV_RAIDZ_64MUL_2((x), mask); \
 }
 
+#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)
+
 /*
  * Force reconstruction to use the general purpose method.
  */
@@ -438,14 +448,14 @@
  * the number of children in the target vdev.
  */
 static raidz_map_t *
-vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
-    uint64_t nparity)
+vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
+    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
 {
 	raidz_map_t *rm;
 	/* The starting RAIDZ (parent) vdev sector of the block. */
-	uint64_t b = zio->io_offset >> unit_shift;
+	uint64_t b = offset >> unit_shift;
 	/* The zio's size in units of the vdev's minimum sector size. */
-	uint64_t s = zio->io_size >> unit_shift;
+	uint64_t s = size >> unit_shift;
 	/* The first column for this stripe. */
 	uint64_t f = b % dcols;
 	/* The starting byte offset on each child vdev. */
@@ -533,13 +543,13 @@
 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
 	ASSERT3U(rm->rm_nskip, <=, nparity);
 
-	if (zio->io_type != ZIO_TYPE_FREE) {
+	if (!dofree) {
 		for (c = 0; c < rm->rm_firstdatacol; c++) {
 			rm->rm_col[c].rc_data =
 			    zio_buf_alloc(rm->rm_col[c].rc_size);
 		}
 
-		rm->rm_col[c].rc_data = zio->io_data;
+		rm->rm_col[c].rc_data = data;
 
 		for (c = c + 1; c < acols; c++) {
 			rm->rm_col[c].rc_data =
@@ -571,7 +581,7 @@
 	ASSERT(rm->rm_cols >= 2);
 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
 
-	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
+	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
 		devidx = rm->rm_col[0].rc_devidx;
 		o = rm->rm_col[0].rc_offset;
 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
@@ -583,8 +593,6 @@
 			rm->rm_skipstart = 1;
 	}
 
-	zio->io_vsd = rm;
-	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 	return (rm);
 }
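[Editor's note: to make the geometry arithmetic above concrete, a small sketch
computing the starting sector, size in sectors, first column, and starting child
offset for an example block. The unit_shift of 9 (512-byte units) and the 5-wide
group are assumed values, not from the patch:]

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t unit_shift = 9;	/* assumed 512-byte minimum sector */
        uint64_t dcols = 5;		/* assumed 4 data + 1 parity children */
        uint64_t offset = 1 << 20, size = 32 << 10;	/* 1 MB, 32 KB I/O */

        uint64_t b = offset >> unit_shift;	/* starting parent sector */
        uint64_t s = size >> unit_shift;	/* I/O size in sectors */
        uint64_t f = b % dcols;			/* first column of stripe */
        uint64_t o = (b / dcols) << unit_shift;	/* starting child offset */

        printf("b=%ju s=%ju f=%ju o=%ju\n",
            (uintmax_t)b, (uintmax_t)s, (uintmax_t)f, (uintmax_t)o);
        return (0);
    }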
 
@@ -994,12 +1002,9 @@
  *           ~~                               ~~
  *           __                               __
  *           |  1   1   1   1   1   1   1   1  |
- *           | 128  64  32  16  8   4   2   1  |
  *           |  19 205 116  29  64  16  4   1  |
  *           |  1   0   0   0   0   0   0   0  |
- *           |  0   1   0   0   0   0   0   0  |
- *  (V|I)' = |  0   0   1   0   0   0   0   0  |
- *           |  0   0   0   1   0   0   0   0  |
+ *  (V|I)' = |  0   0   0   1   0   0   0   0  |
  *           |  0   0   0   0   1   0   0   0  |
  *           |  0   0   0   0   0   1   0   0  |
  *           |  0   0   0   0   0   0   1   0  |
@@ -1479,7 +1484,7 @@
 
 static int
 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
-    uint64_t *ashift)
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_t *cvd;
 	uint64_t nparity = vd->vdev_nparity;
@@ -1508,7 +1513,9 @@
 
 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
-		*ashift = MAX(*ashift, cvd->vdev_ashift);
+		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
+		*physical_ashift = MAX(*physical_ashift,
+		    cvd->vdev_physical_ashift);
 	}
 
 	*asize *= vd->vdev_children;
@@ -1531,6 +1538,154 @@
 		vdev_close(vd->vdev_child[c]);
 }
 
+#ifdef illumos
+/*
+ * Handle a read or write I/O to a RAID-Z dump device.
+ *
+ * The dump device is in a unique situation compared to other ZFS datasets:
+ * writing to this device should be as simple and fast as possible.  In
+ * addition, durability matters much less since the dump will be extracted
+ * once the machine reboots.  For that reason, this function eschews parity for
+ * performance and simplicity.  The dump device uses the checksum setting
+ * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
+ * dataset.
+ *
+ * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
+ * 128 KB will not fill an entire block; in addition, they may not be properly
+ * aligned.  In that case, this function uses the preallocated 128 KB block and
+ * omits reading or writing any "empty" portions of that block, as opposed to
+ * allocating a fresh appropriately-sized block.
+ *
+ * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
+ *
+ *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
+ *
+ * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
+ * allocated which spans all five child vdevs.  8 KB of data would be written to
+ * each of four vdevs, with the fifth containing the parity bits.
+ *
+ *       parity    data     data     data     data
+ *     |   PP   |   XX   |   XX   |   XX   |   XX   |
+ *         ^        ^        ^        ^        ^
+ *         |        |        |        |        |
+ *   8 KB parity    ------8 KB data blocks------
+ *
+ * However, when writing to the dump device, the behavior is different:
+ *
+ *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
+ *
+ * Unlike the normal RAID-Z case in which the block is allocated based on the
+ * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
+ * I/O size is less than 128 KB, only the actual portions of data are written.
+ * In this example the data is written to the third data vdev since that vdev
+ * contains the offset [64 KB, 96 KB).
+ *
+ *       parity    data     data     data     data
+ *     |        |        |        |   XX   |        |
+ *                                    ^
+ *                                    |
+ *                             32 KB data block
+ *
+ * As a result, an individual I/O may not span all child vdevs; moreover, a
+ * small I/O may only operate on a single child vdev.
+ *
+ * Note that since there are no parity bits calculated or written, this format
+ * remains the same no matter how many parity bits are used in a normal RAID-Z
+ * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
+ * would look like:
+ *
+ *       parity   parity   parity    data     data     data     data
+ *     |        |        |        |        |        |   XX   |        |
+ *                                                      ^
+ *                                                      |
+ *                                               32 KB data block
+ */
+int
+vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
+    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
+{
+	vdev_t *tvd = vd->vdev_top;
+	vdev_t *cvd;
+	raidz_map_t *rm;
+	raidz_col_t *rc;
+	int c, err = 0;
+
+	uint64_t start, end, colstart, colend;
+	uint64_t coloffset, colsize, colskip;
+
+	int flags = doread ? BIO_READ : BIO_WRITE;
+
+#ifdef	_KERNEL
+
+	/*
+	 * Don't write past the end of the block
+	 */
+	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
+
+	start = offset;
+	end = start + size;
+
+	/*
+	 * Allocate a RAID-Z map for this block.  Note that this block starts
+	 * from the "original" offset, that is, the offset of the extent which
+	 * contains the requisite offset of the data being read or written.
+	 *
+	 * Even if this I/O operation doesn't span the full block size, let's
+	 * treat the on-disk format as if the only blocks are the complete 128
+	 * KB size.
+	 */
+	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
+	    SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
+	    vd->vdev_children, vd->vdev_nparity);
+
+	coloffset = origoffset;
+
+	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
+	    c++, coloffset += rc->rc_size) {
+		rc = &rm->rm_col[c];
+		cvd = vd->vdev_child[rc->rc_devidx];
+
+		/*
+		 * Find the start and end of this column in the RAID-Z map,
+		 * keeping in mind that the stated size and offset of the
+		 * operation may not fill the entire column for this vdev.
+		 *
+		 * If any portion of the data spans this column, issue the
+		 * appropriate operation to the vdev.
+		 */
+		if (coloffset + rc->rc_size <= start)
+			continue;
+		if (coloffset >= end)
+			continue;
+
+		colstart = MAX(coloffset, start);
+		colend = MIN(end, coloffset + rc->rc_size);
+		colsize = colend - colstart;
+		colskip = colstart - coloffset;
+
+		VERIFY3U(colsize, <=, rc->rc_size);
+		VERIFY3U(colskip, <=, rc->rc_size);
+
+		/*
+		 * Note that the child vdev will have a vdev label at the start
+		 * of its range of offsets, hence the need for
+		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
+		 * example of why this calculation is needed.
+		 */
+		if ((err = vdev_disk_physio(cvd,
+		    ((char *)rc->rc_data) + colskip, colsize,
+		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
+		    flags, isdump)) != 0)
+			break;
+	}
+
+	vdev_raidz_map_free(rm);
+#endif	/* _KERNEL */
+
+	return (err);
+}
+#endif
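[Editor's note: for the column-clipping arithmetic above, a small stand-alone check
of the 32 KB-at-64 KB example from the block comment, assuming 32 KB columns:]

    #include <stdint.h>
    #include <stdio.h>

    #define	MAX(a, b)	((a) > (b) ? (a) : (b))
    #define	MIN(a, b)	((a) < (b) ? (a) : (b))

    int
    main(void)
    {
        uint64_t start = 64 << 10, end = start + (32 << 10);	/* I/O range */
        uint64_t coloffset = 64 << 10, colsz = 32 << 10;	/* 3rd data col */

        if (coloffset + colsz > start && coloffset < end) {
            uint64_t colstart = MAX(coloffset, start);
            uint64_t colend = MIN(end, coloffset + colsz);

            printf("colsize=%ju colskip=%ju\n",
                (uintmax_t)(colend - colstart),
                (uintmax_t)(colstart - coloffset));	/* 32768, 0 */
        }
        return (0);
    }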
+
 static uint64_t
 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
 {
@@ -1573,7 +1728,7 @@
  *      vdevs have had errors, then create zio read operations to the parity
  *      columns' VDevs as well.
  */
-static int
+static void
 vdev_raidz_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
@@ -1583,9 +1738,14 @@
 	raidz_col_t *rc;
 	int c, i;
 
-	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
+	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
+	    zio->io_type == ZIO_TYPE_FREE,
+	    tvd->vdev_ashift, vd->vdev_children,
 	    vd->vdev_nparity);
 
+	zio->io_vsd = rm;
+	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+
 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
 
 	if (zio->io_type == ZIO_TYPE_FREE) {
@@ -1597,7 +1757,9 @@
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		}
-		return (ZIO_PIPELINE_CONTINUE);
+
+		zio_execute(zio);
+		return;
 	}
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
@@ -1629,7 +1791,8 @@
 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
 		}
 
-		return (ZIO_PIPELINE_CONTINUE);
+		zio_execute(zio);
+		return;
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -1669,7 +1832,7 @@
 		}
 	}
 
-	return (ZIO_PIPELINE_CONTINUE);
+	zio_execute(zio);
 }
 
 
@@ -1728,6 +1891,13 @@
 	int c, ret = 0;
 	raidz_col_t *rc;
 
+	blkptr_t *bp = zio->io_bp;
+	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+
+	if (checksum == ZIO_CHECKSUM_NOPARITY)
+		return (ret);
+
 	for (c = 0; c < rm->rm_firstdatacol; c++) {
 		rc = &rm->rm_col[c];
 		if (!rc->rc_tried || rc->rc_error != 0)
@@ -2205,7 +2375,7 @@
 
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_data, rc->rc_size,
-			    ZIO_TYPE_WRITE, zio->io_priority,
+			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -56,7 +56,7 @@
 
 static int
 vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
-    uint64_t *ashift)
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	int lasterror = 0;
 	int numerrors = 0;
@@ -84,7 +84,8 @@
 
 	*asize = 0;
 	*max_asize = 0;
-	*ashift = 0;
+	*logical_ashift = 0;
+	*physical_ashift = 0;
 
 	return (0);
 }

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 /*
@@ -51,10 +52,10 @@
 
 int fzap_default_block_shift = 14; /* 16k blocksize */
 
-static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
+extern inline zap_phys_t *zap_f_phys(zap_t *zap);
+
 static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
 
-
 void
 fzap_byteswap(void *vbuf, size_t size)
 {
@@ -81,13 +82,12 @@
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 	zap->zap_ismicro = FALSE;
 
-	(void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
-	    &zap->zap_f.zap_phys, zap_evict);
+	zap->zap_dbu.dbu_evict_func = zap_evict;
 
 	mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
-	zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1;
+	zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
 
-	zp = zap->zap_f.zap_phys;
+	zp = zap_f_phys(zap);
 	/*
 	 * explicitly zero it since it might be coming from an
 	 * initialized microzap
@@ -118,7 +118,6 @@
 
 	l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 	l->l_dbuf = db;
-	l->l_phys = db->db_data;
 
 	zap_leaf_init(l, zp->zap_normflags != 0);
 
@@ -164,8 +163,9 @@
 		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
 		tbl->zt_nextblk = newblk;
 		ASSERT0(tbl->zt_blks_copied);
-		dmu_prefetch(zap->zap_objset, zap->zap_object,
-		    tbl->zt_blk << bs, tbl->zt_numblks << bs);
+		dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+		    tbl->zt_blk << bs, tbl->zt_numblks << bs,
+		    ZIO_PRIORITY_SYNC_READ);
 	}
 
 	/*
@@ -271,6 +271,7 @@
 	uint64_t blk, off;
 	int err;
 	dmu_buf_t *db;
+	dnode_t *dn;
 	int bs = FZAP_BLOCK_SHIFT(zap);
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
@@ -278,8 +279,15 @@
 	blk = idx >> (bs-3);
 	off = idx & ((1<<(bs-3))-1);
 
-	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	/*
+	 * Note: this is equivalent to dmu_buf_hold(), but we use
+	 * _dnode_enter / _by_dnode because it's faster: it spares us
+	 * a dnode hold on each call.
+	 */
+	dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+	err = dmu_buf_hold_by_dnode(dn,
 	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
+	dmu_buf_dnode_exit(zap->zap_dbuf);
 	if (err)
 		return (err);
 	*valp = ((uint64_t *)db->db_data)[off];
@@ -293,9 +301,11 @@
 		 */
 		blk = (idx*2) >> (bs-3);
 
-		err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+		dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+		err = dmu_buf_hold_by_dnode(dn,
 		    (tbl->zt_nextblk + blk) << bs, FTAG, &db,
 		    DMU_READ_NO_PREFETCH);
+		dmu_buf_dnode_exit(zap->zap_dbuf);
 		if (err == 0)
 			dmu_buf_rele(db, FTAG);
 	}
@@ -326,10 +336,10 @@
 	 * If we are within 2 bits of running out, stop growing, since
 	 * this is already an aberrant condition.
 	 */
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
 		return (SET_ERROR(ENOSPC));
 
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 		/*
 		 * We are outgrowing the "embedded" ptrtbl (the one
 		 * stored in the header block).  Give it its own entire
@@ -339,9 +349,9 @@
 		dmu_buf_t *db_new;
 		int err;
 
-		ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+		ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
 		    ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
-		ASSERT0(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk);
+		ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
 
 		newblk = zap_allocate_blocks(zap, 1);
 		err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
@@ -354,17 +364,17 @@
 		    db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
 		dmu_buf_rele(db_new, FTAG);
 
-		zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
-		zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
-		zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
+		zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
+		zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
+		zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
 
-		ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
-		    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
+		ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
+		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
 		    (FZAP_BLOCK_SHIFT(zap)-3));
 
 		return (0);
 	} else {
-		return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+		return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
 		    zap_ptrtbl_transfer, tx));
 	}
 }
@@ -374,8 +384,8 @@
 {
 	dmu_buf_will_dirty(zap->zap_dbuf, tx);
 	mutex_enter(&zap->zap_f.zap_num_entries_mtx);
-	ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
-	zap->zap_f.zap_phys->zap_num_entries += delta;
+	ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
+	zap_f_phys(zap)->zap_num_entries += delta;
 	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 }
 
@@ -384,16 +394,25 @@
 {
 	uint64_t newblk;
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-	newblk = zap->zap_f.zap_phys->zap_freeblk;
-	zap->zap_f.zap_phys->zap_freeblk += nblocks;
+	newblk = zap_f_phys(zap)->zap_freeblk;
+	zap_f_phys(zap)->zap_freeblk += nblocks;
 	return (newblk);
 }
 
+static void
+zap_leaf_pageout(void *dbu)
+{
+	zap_leaf_t *l = dbu;
+
+	rw_destroy(&l->l_rwlock);
+	kmem_free(l, sizeof (zap_leaf_t));
+}
+
 static zap_leaf_t *
 zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
 {
 	void *winner;
-	zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+	zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
@@ -401,18 +420,18 @@
 	rw_enter(&l->l_rwlock, RW_WRITER);
 	l->l_blkid = zap_allocate_blocks(zap, 1);
 	l->l_dbuf = NULL;
-	l->l_phys = NULL;
 
 	VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 	    l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
 	    DMU_READ_NO_PREFETCH));
-	winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
+	dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf);
+	winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu);
 	ASSERT(winner == NULL);
 	dmu_buf_will_dirty(l->l_dbuf, tx);
 
 	zap_leaf_init(l, zap->zap_normflags != 0);
 
-	zap->zap_f.zap_phys->zap_num_leafs++;
+	zap_f_phys(zap)->zap_num_leafs++;
 
 	return (l);
 }
@@ -422,7 +441,7 @@
 {
 	ASSERT(!zap->zap_ismicro);
 	mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
-	*count = zap->zap_f.zap_phys->zap_num_entries;
+	*count = zap_f_phys(zap)->zap_num_entries;
 	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 	return (0);
 }
@@ -438,16 +457,6 @@
 	dmu_buf_rele(l->l_dbuf, NULL);
 }
 
-_NOTE(ARGSUSED(0))
-static void
-zap_leaf_pageout(dmu_buf_t *db, void *vl)
-{
-	zap_leaf_t *l = vl;
-
-	rw_destroy(&l->l_rwlock);
-	kmem_free(l, sizeof (zap_leaf_t));
-}
-
 static zap_leaf_t *
 zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
 {
@@ -455,20 +464,20 @@
 
 	ASSERT(blkid != 0);
 
-	l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+	l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 	rw_init(&l->l_rwlock, 0, 0, 0);
 	rw_enter(&l->l_rwlock, RW_WRITER);
 	l->l_blkid = blkid;
-	l->l_bs = highbit(db->db_size)-1;
+	l->l_bs = highbit64(db->db_size) - 1;
 	l->l_dbuf = db;
-	l->l_phys = NULL;
 
-	winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
+	dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf);
+	winner = dmu_buf_set_user(db, &l->l_dbu);
 
 	rw_exit(&l->l_rwlock);
 	if (winner != NULL) {
 		/* someone else set it first */
-		zap_leaf_pageout(NULL, l);
+		zap_leaf_pageout(&l->l_dbu);
 		l = winner;
 	}
 
@@ -477,7 +486,7 @@
 	 * chain.  There should be no chained leafs (as we have removed
 	 * support for them).
 	 */
-	ASSERT0(l->l_phys->l_hdr.lh_pad1);
+	ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
 
 	/*
 	 * There should be more hash entries than there can be
@@ -487,11 +496,11 @@
 
 	/* The chunks should begin at the end of the hash table */
 	ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
-	    &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
+	    &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
 
 	/* The chunks should end at the end of the block */
 	ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
-	    (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size);
+	    (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
 
 	return (l);
 }
@@ -507,8 +516,10 @@
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
-	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+	err = dmu_buf_hold_by_dnode(dn,
 	    blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
+	dmu_buf_dnode_exit(zap->zap_dbuf);
 	if (err)
 		return (err);
 
@@ -524,7 +535,7 @@
 
 	rw_enter(&l->l_rwlock, lt);
 	/*
-	 * Must lock before dirtying, otherwise l->l_phys could change,
+	 * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
 	 * causing ASSERT below to fail.
 	 */
 	if (lt == RW_WRITER)
@@ -531,9 +542,8 @@
 		dmu_buf_will_dirty(db, tx);
 	ASSERT3U(l->l_blkid, ==, blkid);
 	ASSERT3P(l->l_dbuf, ==, db);
-	ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
-	ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF);
-	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 
 	*lp = l;
 	return (0);
@@ -544,13 +554,13 @@
 {
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 		ASSERT3U(idx, <,
-		    (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
+		    (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
 		*valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
 		return (0);
 	} else {
-		return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+		return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
 		    idx, valp));
 	}
 }
@@ -561,11 +571,11 @@
 	ASSERT(tx != NULL);
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
 		ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
 		return (0);
 	} else {
-		return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+		return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
 		    idx, blk, tx));
 	}
 }
@@ -577,21 +587,23 @@
 	int err;
 
 	ASSERT(zap->zap_dbuf == NULL ||
-	    zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
-	ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
-	idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+	    zap_f_phys(zap) == zap->zap_dbuf->db_data);
+	ASSERT3U(zap_f_phys(zap)->zap_magic, ==, ZAP_MAGIC);
+	idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	err = zap_idx_to_blk(zap, idx, &blk);
 	if (err != 0)
 		return (err);
 	err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
 
-	ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) ==
-	    (*lp)->l_phys->l_hdr.lh_prefix);
+	ASSERT(err ||
+	    ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
+	    zap_leaf_phys(*lp)->l_hdr.lh_prefix);
 	return (err);
 }
 
 static int
-zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
+zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
+    void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
 {
 	zap_t *zap = zn->zn_zap;
 	uint64_t hash = zn->zn_hash;
@@ -598,24 +610,24 @@
 	zap_leaf_t *nl;
 	int prefix_diff, i, err;
 	uint64_t sibling;
-	int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
+	int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 
-	ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+	ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
 	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
-	    l->l_phys->l_hdr.lh_prefix);
+	    zap_leaf_phys(l)->l_hdr.lh_prefix);
 
 	if (zap_tryupgradedir(zap, tx) == 0 ||
-	    old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+	    old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 		/* We failed to upgrade, or need to grow the pointer table */
 		objset_t *os = zap->zap_objset;
 		uint64_t object = zap->zap_object;
 
 		zap_put_leaf(l);
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, tag);
 		err = zap_lockdir(os, object, tx, RW_WRITER,
-		    FALSE, FALSE, &zn->zn_zap);
+		    FALSE, FALSE, tag, &zn->zn_zap);
 		zap = zn->zn_zap;
 		if (err)
 			return (err);
@@ -622,7 +634,7 @@
 		ASSERT(!zap->zap_ismicro);
 
 		while (old_prefix_len ==
-		    zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+		    zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 			err = zap_grow_ptrtbl(zap, tx);
 			if (err)
 				return (err);
@@ -632,7 +644,7 @@
 		if (err)
 			return (err);
 
-		if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) {
+		if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
 			/* it split while our locks were down */
 			*lp = l;
 			return (0);
@@ -639,11 +651,11 @@
 		}
 	}
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-	ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+	ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
-	    l->l_phys->l_hdr.lh_prefix);
+	    zap_leaf_phys(l)->l_hdr.lh_prefix);
 
-	prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
+	prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
 	    (old_prefix_len + 1);
 	sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
 
@@ -665,7 +677,7 @@
 		ASSERT0(err); /* we checked for i/o errors above */
 	}
 
-	if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) {
+	if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
 		/* we want the sibling */
 		zap_put_leaf(l);
 		*lp = nl;
@@ -678,16 +690,17 @@
 }
 
 static void
-zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
+zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
+    void *tag, dmu_tx_t *tx)
 {
 	zap_t *zap = zn->zn_zap;
-	int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
-	int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift &&
-	    l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
+	int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+	int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
+	    zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
 
 	zap_put_leaf(l);
 
-	if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) {
+	if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
 		int err;
 
 		/*
@@ -698,9 +711,9 @@
 			objset_t *os = zap->zap_objset;
 			uint64_t zapobj = zap->zap_object;
 
-			zap_unlockdir(zap);
+			zap_unlockdir(zap, tag);
 			err = zap_lockdir(os, zapobj, tx,
-			    RW_WRITER, FALSE, FALSE, &zn->zn_zap);
+			    RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
 			zap = zn->zn_zap;
 			if (err)
 				return;
@@ -707,7 +720,7 @@
 		}
 
 		/* could have finished growing while our locks were down */
-		if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift)
+		if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
 			(void) zap_grow_ptrtbl(zap, tx);
 	}
 }
@@ -790,7 +803,7 @@
 int
 fzap_add_cd(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
-    const void *val, uint32_t cd, dmu_tx_t *tx)
+    const void *val, uint32_t cd, void *tag, dmu_tx_t *tx)
 {
 	zap_leaf_t *l;
 	int err;
@@ -819,7 +832,7 @@
 	if (err == 0) {
 		zap_increment_num_entries(zap, 1, tx);
 	} else if (err == EAGAIN) {
-		err = zap_expand_leaf(zn, l, tx, &l);
+		err = zap_expand_leaf(zn, l, tag, tx, &l);
 		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
 		if (err == 0)
 			goto retry;
@@ -827,7 +840,7 @@
 
 out:
 	if (zap != NULL)
-		zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
+		zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
 	return (err);
 }
 
@@ -834,7 +847,7 @@
 int
 fzap_add(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx)
+    const void *val, void *tag, dmu_tx_t *tx)
 {
 	int err = fzap_check(zn, integer_size, num_integers);
 	if (err != 0)
@@ -841,12 +854,13 @@
 		return (err);
 
 	return (fzap_add_cd(zn, integer_size, num_integers,
-	    val, ZAP_NEED_CD, tx));
+	    val, ZAP_NEED_CD, tag, tx));
 }
 
 int
 fzap_update(zap_name_t *zn,
-    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+    int integer_size, uint64_t num_integers, const void *val,
+    void *tag, dmu_tx_t *tx)
 {
 	zap_leaf_t *l;
 	int err, create;
@@ -876,7 +890,7 @@
 	}
 
 	if (err == EAGAIN) {
-		err = zap_expand_leaf(zn, l, tx, &l);
+		err = zap_expand_leaf(zn, l, tag, tx, &l);
 		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
 		if (err == 0)
 			goto retry;
@@ -883,7 +897,7 @@
 	}
 
 	if (zap != NULL)
-		zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
+		zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
 	return (err);
 }
 
@@ -938,11 +952,12 @@
 	int bs;
 
 	idx = ZAP_HASH_IDX(zn->zn_hash,
-	    zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+	    zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	if (zap_idx_to_blk(zap, idx, &blk) != 0)
 		return;
 	bs = FZAP_BLOCK_SHIFT(zap);
-	dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs);
+	dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
+	    ZIO_PRIORITY_SYNC_READ);
 }
 
 /*
@@ -1170,8 +1185,8 @@
 
 	if (zc->zc_leaf &&
 	    (ZAP_HASH_IDX(zc->zc_hash,
-	    zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) !=
-	    zc->zc_leaf->l_phys->l_hdr.lh_prefix)) {
+	    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
+	    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
 		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
 		zap_put_leaf(zc->zc_leaf);
 		zc->zc_leaf = NULL;
@@ -1192,10 +1207,11 @@
 
 	if (err == ENOENT) {
 		uint64_t nocare =
-		    (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1;
+		    (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
 		zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
 		zc->zc_cd = 0;
-		if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) {
+		if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 ||
+		    zc->zc_hash == 0) {
 			zc->zc_hash = -1ULL;
 		} else {
 			zap_put_leaf(zc->zc_leaf);
@@ -1287,25 +1303,25 @@
 	/*
 	 * Set zap_phys_t fields
 	 */
-	zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
-	zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
-	zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
-	zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type;
-	zs->zs_magic = zap->zap_f.zap_phys->zap_magic;
-	zs->zs_salt = zap->zap_f.zap_phys->zap_salt;
+	zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
+	zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
+	zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
+	zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
+	zs->zs_magic = zap_f_phys(zap)->zap_magic;
+	zs->zs_salt = zap_f_phys(zap)->zap_salt;
 
 	/*
 	 * Set zap_ptrtbl fields
 	 */
-	zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
-	zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk;
+	zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+	zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
 	zs->zs_ptrtbl_blks_copied =
-	    zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied;
-	zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk;
-	zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
-	zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+	    zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
+	zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
+	zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
+	zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 		/* the ptrtbl is entirely in the header block. */
 		zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
 		    1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
@@ -1312,17 +1328,18 @@
 	} else {
 		int b;
 
-		dmu_prefetch(zap->zap_objset, zap->zap_object,
-		    zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs,
-		    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs);
+		dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+		    zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
+		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
+		    ZIO_PRIORITY_SYNC_READ);
 
-		for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
+		for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
 		    b++) {
 			dmu_buf_t *db;
 			int err;
 
 			err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-			    (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
+			    (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
 			    FTAG, &db, DMU_READ_NO_PREFETCH);
 			if (err == 0) {
 				zap_stats_ptrtbl(zap, db->db_data,
@@ -1334,8 +1351,8 @@
 }
 
 int
-fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
-    uint64_t *tooverwrite)
+fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite,
+    refcount_t *tooverwrite)
 {
 	zap_t *zap = zn->zn_zap;
 	zap_leaf_t *l;
@@ -1345,9 +1362,11 @@
 	 * Account for the header block of the fatzap.
 	 */
 	if (!add && dmu_buf_freeable(zap->zap_dbuf)) {
-		*tooverwrite += zap->zap_dbuf->db_size;
+		(void) refcount_add_many(tooverwrite,
+		    zap->zap_dbuf->db_size, FTAG);
 	} else {
-		*towrite += zap->zap_dbuf->db_size;
+		(void) refcount_add_many(towrite,
+		    zap->zap_dbuf->db_size, FTAG);
 	}
 
 	/*
@@ -1359,10 +1378,13 @@
 	 *   could extend the table.
 	 */
 	if (add) {
-		if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0)
-			*towrite += zap->zap_dbuf->db_size;
-		else
-			*towrite += (zap->zap_dbuf->db_size * 3);
+		if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
+			(void) refcount_add_many(towrite,
+			    zap->zap_dbuf->db_size, FTAG);
+		} else {
+			(void) refcount_add_many(towrite,
+			    zap->zap_dbuf->db_size * 3, FTAG);
+		}
 	}
 
 	/*
@@ -1375,13 +1397,14 @@
 	}
 
 	if (!add && dmu_buf_freeable(l->l_dbuf)) {
-		*tooverwrite += l->l_dbuf->db_size;
+		(void) refcount_add_many(tooverwrite, l->l_dbuf->db_size, FTAG);
 	} else {
 		/*
 		 * If this is an add operation, the leaf block could split.
 		 * Hence, we need to account for an additional leaf block.
 		 */
-		*towrite += (add ? 2 : 1) * l->l_dbuf->db_size;
+		(void) refcount_add_many(towrite,
+		    (add ? 2 : 1) * l->l_dbuf->db_size, FTAG);
 	}
 
 	zap_put_leaf(l);

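Most of the zap.c churn above swaps the cached zap->zap_f.zap_phys pointer for the zap_f_phys() accessor, which rereads the header dbuf's db_data on every use; with the new dmu_buf_init_user()/dmu_buf_set_user() API there is no longer a user_data_ptr_ptr for the DMU to keep such a cached pointer current. A simplified model of the accessor pattern (the struct layouts below are stand-ins for illustration, not the real ZFS definitions):

#include <assert.h>
#include <stdint.h>

/* Simplified stand-ins for dmu_buf_t, zap_phys_t and zap_t. */
typedef struct dmu_buf {
	void *db_data;			/* current buffer contents */
} dmu_buf_t;

typedef struct zap_phys {
	uint64_t zap_num_entries;
} zap_phys_t;

typedef struct zap {
	dmu_buf_t *zap_dbuf;		/* header dbuf of the ZAP object */
} zap_t;

/*
 * Derive the phys pointer from the dbuf each time instead of caching
 * it, so the pointer can never go stale behind the caller's back.
 */
static inline zap_phys_t *
zap_f_phys(zap_t *zap)
{
	return (zap->zap_dbuf->db_data);
}

int
main(void)
{
	zap_phys_t phys = { 42 };
	dmu_buf_t db = { &phys };
	zap_t zap = { &db };

	assert(zap_f_phys(&zap)->zap_num_entries == 42);
	return (0);
}
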
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  */
 
 /*
@@ -49,10 +49,12 @@
 
 #define	LEAF_HASH(l, h) \
 	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
-	((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))
+	((h) >> \
+	(64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
 
-#define	LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
+#define	LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
 
+extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
 
 static void
 zap_memset(void *a, int c, size_t n)
@@ -106,17 +108,20 @@
 {
 	int i;
 	zap_leaf_t l;
-	l.l_bs = highbit(size)-1;
-	l.l_phys = buf;
+	dmu_buf_t l_dbuf;
 
-	buf->l_hdr.lh_block_type = 	BSWAP_64(buf->l_hdr.lh_block_type);
-	buf->l_hdr.lh_prefix = 		BSWAP_64(buf->l_hdr.lh_prefix);
-	buf->l_hdr.lh_magic = 		BSWAP_32(buf->l_hdr.lh_magic);
-	buf->l_hdr.lh_nfree = 		BSWAP_16(buf->l_hdr.lh_nfree);
-	buf->l_hdr.lh_nentries = 	BSWAP_16(buf->l_hdr.lh_nentries);
-	buf->l_hdr.lh_prefix_len = 	BSWAP_16(buf->l_hdr.lh_prefix_len);
-	buf->l_hdr.lh_freelist = 	BSWAP_16(buf->l_hdr.lh_freelist);
+	l_dbuf.db_data = buf;
+	l.l_bs = highbit64(size) - 1;
+	l.l_dbuf = &l_dbuf;
 
+	buf->l_hdr.lh_block_type =	BSWAP_64(buf->l_hdr.lh_block_type);
+	buf->l_hdr.lh_prefix =		BSWAP_64(buf->l_hdr.lh_prefix);
+	buf->l_hdr.lh_magic =		BSWAP_32(buf->l_hdr.lh_magic);
+	buf->l_hdr.lh_nfree =		BSWAP_16(buf->l_hdr.lh_nfree);
+	buf->l_hdr.lh_nentries =	BSWAP_16(buf->l_hdr.lh_nentries);
+	buf->l_hdr.lh_prefix_len =	BSWAP_16(buf->l_hdr.lh_prefix_len);
+	buf->l_hdr.lh_freelist =	BSWAP_16(buf->l_hdr.lh_freelist);
+
 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
 		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
 
@@ -158,19 +163,21 @@
 {
 	int i;
 
-	l->l_bs = highbit(l->l_dbuf->db_size)-1;
-	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
-	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+	l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
+	zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
+	    sizeof (struct zap_leaf_header));
+	zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+	    2*ZAP_LEAF_HASH_NUMENTRIES(l));
 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
 		ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
 		ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
 	}
 	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
-	l->l_phys->l_hdr.lh_block_type = ZBT_LEAF;
-	l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
-	l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
+	zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF;
+	zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
+	zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
 	if (sort)
-		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+		zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
 }
 
 /*
@@ -182,15 +189,16 @@
 {
 	int chunk;
 
-	ASSERT(l->l_phys->l_hdr.lh_nfree > 0);
+	ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
 
-	chunk = l->l_phys->l_hdr.lh_freelist;
+	chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 	ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
 
-	l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
+	zap_leaf_phys(l)->l_hdr.lh_freelist =
+	    ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
 
-	l->l_phys->l_hdr.lh_nfree--;
+	zap_leaf_phys(l)->l_hdr.lh_nfree--;
 
 	return (chunk);
 }
@@ -199,16 +207,16 @@
 zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
 {
 	struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
-	ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 	ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
 
 	zlf->lf_type = ZAP_CHUNK_FREE;
-	zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
+	zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist;
 	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
-	l->l_phys->l_hdr.lh_freelist = chunk;
+	zap_leaf_phys(l)->l_hdr.lh_freelist = chunk;
 
-	l->l_phys->l_hdr.lh_nfree++;
+	zap_leaf_phys(l)->l_hdr.lh_nfree++;
 }
 
 /*
@@ -394,7 +402,7 @@
 	uint16_t *chunkp;
 	struct zap_leaf_entry *le;
 
-	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 
 again:
 	for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
@@ -414,7 +422,7 @@
 		 * lowest-cd match for MT_FIRST.
 		 */
 		ASSERT(zn->zn_matchtype == MT_EXACT ||
-		    (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
+		    (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
 		if (zap_leaf_array_match(l, zn, le->le_name_chunk,
 		    le->le_name_numints)) {
 			zeh->zeh_num_integers = le->le_value_numints;
@@ -454,10 +462,10 @@
 	uint16_t lh;
 	struct zap_leaf_entry *le;
 
-	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 
 	for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
-		for (chunk = l->l_phys->l_hash[lh];
+		for (chunk = zap_leaf_phys(l)->l_hash[lh];
 		    chunk != CHAIN_END; chunk = le->le_next) {
 			le = ZAP_LEAF_ENTRY(l, chunk);
 
@@ -528,7 +536,7 @@
 
 int
 zap_entry_update(zap_entry_handle_t *zeh,
-	uint8_t integer_size, uint64_t num_integers, const void *buf)
+    uint8_t integer_size, uint64_t num_integers, const void *buf)
 {
 	int delta_chunks;
 	zap_leaf_t *l = zeh->zeh_leaf;
@@ -537,7 +545,7 @@
 	delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
 	    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
 
-	if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
+	if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
 		return (SET_ERROR(EAGAIN));
 
 	zap_leaf_array_free(l, &le->le_value_chunk);
@@ -567,7 +575,7 @@
 	*zeh->zeh_chunkp = le->le_next;
 	zap_leaf_chunk_free(l, entry_chunk);
 
-	l->l_phys->l_hdr.lh_nentries--;
+	zap_leaf_phys(l)->l_hdr.lh_nentries--;
 }
 
 int
@@ -591,7 +599,7 @@
 
 	if (cd == ZAP_NEED_CD) {
 		/* find the lowest unused cd */
-		if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
+		if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
 			cd = 0;
 
 			for (chunk = *LEAF_HASH_ENTPTR(l, h);
@@ -627,7 +635,7 @@
 		ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
 	}
 
-	if (l->l_phys->l_hdr.lh_nfree < numchunks)
+	if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks)
 		return (SET_ERROR(EAGAIN));
 
 	/* make the entry */
@@ -648,7 +656,7 @@
 	/* XXX if we did the search above, we could just use that */
 	chunkp = zap_leaf_rehash_entry(l, chunk);
 
-	l->l_phys->l_hdr.lh_nentries++;
+	zap_leaf_phys(l)->l_hdr.lh_nentries++;
 
 	zeh->zeh_leaf = l;
 	zeh->zeh_num_integers = num_integers;
@@ -782,8 +790,8 @@
 
 	zap_leaf_chunk_free(l, entry);
 
-	l->l_phys->l_hdr.lh_nentries--;
-	nl->l_phys->l_hdr.lh_nentries++;
+	zap_leaf_phys(l)->l_hdr.lh_nentries--;
+	zap_leaf_phys(nl)->l_hdr.lh_nentries++;
 }
 
 /*
@@ -793,19 +801,22 @@
 zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
 {
 	int i;
-	int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;
+	int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 
 	/* set new prefix and prefix_len */
-	l->l_phys->l_hdr.lh_prefix <<= 1;
-	l->l_phys->l_hdr.lh_prefix_len++;
-	nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
-	nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
+	zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
+	zap_leaf_phys(l)->l_hdr.lh_prefix_len++;
+	zap_leaf_phys(nl)->l_hdr.lh_prefix =
+	    zap_leaf_phys(l)->l_hdr.lh_prefix | 1;
+	zap_leaf_phys(nl)->l_hdr.lh_prefix_len =
+	    zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 
 	/* break existing hash chains */
-	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+	zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+	    2*ZAP_LEAF_HASH_NUMENTRIES(l));
 
 	if (sort)
-		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+		zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
 
 	/*
 	 * Transfer entries whose hash bit 'bit' is set to nl; rehash
@@ -833,18 +844,18 @@
 {
 	int i, n;
 
-	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
-	    l->l_phys->l_hdr.lh_prefix_len;
+	n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+	    zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 	zs->zs_leafs_with_2n_pointers[n]++;
 
 
-	n = l->l_phys->l_hdr.lh_nentries/5;
+	n = zap_leaf_phys(l)->l_hdr.lh_nentries/5;
 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 	zs->zs_blocks_with_n5_entries[n]++;
 
 	n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
-	    l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
+	    zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
 	    (1<<FZAP_BLOCK_SHIFT(zap));
 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 	zs->zs_blocks_n_tenths_full[n]++;
@@ -851,7 +862,7 @@
 
 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
 		int nentries = 0;
-		int chunk = l->l_phys->l_hash[i];
+		int chunk = zap_leaf_phys(l)->l_hash[i];
 
 		while (chunk != CHAIN_END) {
 			struct zap_leaf_entry *le =

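zap_leaf.c gets the same accessor treatment (l->l_phys becomes zap_leaf_phys(l)), and the LEAF_HASH macro is re-wrapped around it. The macro selects the in-leaf hash slot from the hash bits just below the lh_prefix_len bits that were already consumed routing to this leaf. A standalone worked example of that bit selection, with assumed values for the prefix length and table shift:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t h = 0xdeadbeefcafef00dULL;	/* example entry hash */
	int prefix_len = 10;	/* assumed lh_prefix_len of this leaf */
	int hash_shift = 12;	/* assumed ZAP_LEAF_HASH_SHIFT value */
	uint64_t numentries = 1ULL << hash_shift;

	/* Same shape as LEAF_HASH(): take hash_shift bits below the prefix. */
	uint64_t slot = (numentries - 1) &
	    (h >> (64 - hash_shift - prefix_len));

	printf("leaf hash slot = %llu\n", (unsigned long long)slot);
	return (0);
}
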
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,9 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zio.h>
@@ -34,19 +36,23 @@
 #include <sys/zap_leaf.h>
 #include <sys/avl.h>
 #include <sys/arc.h>
+#include <sys/dmu_objset.h>
 
 #ifdef _KERNEL
 #include <sys/sunddi.h>
 #endif
 
-static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
+extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
 
+static int mzap_upgrade(zap_t **zapp,
+    void *tag, dmu_tx_t *tx, zap_flags_t flags);
+
 uint64_t
 zap_getflags(zap_t *zap)
 {
 	if (zap->zap_ismicro)
 		return (0);
-	return (zap->zap_u.zap_fat.zap_phys->zap_flags);
+	return (zap_f_phys(zap)->zap_flags);
 }
 
 int
@@ -381,7 +387,7 @@
 
 	if (*(uint64_t *)db->db_data != ZBT_MICRO) {
 		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
-		zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
+		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
 	} else {
 		zap->zap_ismicro = TRUE;
 	}
@@ -391,7 +397,8 @@
 	 * it, because zap_lockdir() checks zap_ismicro without the lock
 	 * held.
 	 */
-	winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);
+	dmu_buf_init_user(&zap->zap_dbu, zap_evict, &zap->zap_dbuf);
+	winner = dmu_buf_set_user(db, &zap->zap_dbu);
 
 	if (winner != NULL) {
 		rw_exit(&zap->zap_rwlock);
@@ -403,8 +410,8 @@
 	}
 
 	if (zap->zap_ismicro) {
-		zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
-		zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags;
+		zap->zap_salt = zap_m_phys(zap)->mz_salt;
+		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
 		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
 		avl_create(&zap->zap_m.zap_avl, mze_compare,
 		    sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
@@ -411,7 +418,7 @@
 
 		for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
 			mzap_ent_phys_t *mze =
-			    &zap->zap_m.zap_phys->mz_chunk[i];
+			    &zap_m_phys(zap)->mz_chunk[i];
 			if (mze->mze_name[0]) {
 				zap_name_t *zn;
 
@@ -428,8 +435,8 @@
 			}
 		}
 	} else {
-		zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
-		zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags;
+		zap->zap_salt = zap_f_phys(zap)->zap_salt;
+		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
 
 		ASSERT3U(sizeof (struct zap_leaf_header), ==,
 		    2*ZAP_LEAF_CHUNKSIZE);
@@ -439,7 +446,7 @@
 		 * other members.
 		 */
 		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
-		    &zap->zap_f.zap_phys->zap_salt);
+		    &zap_f_phys(zap)->zap_salt);
 
 		/*
 		 * The embedded pointer table should end at the end of
@@ -447,7 +454,7 @@
 		 */
 		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
 		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
-		    (uintptr_t)zap->zap_f.zap_phys, ==,
+		    (uintptr_t)zap_f_phys(zap), ==,
 		    zap->zap_dbuf->db_size);
 	}
 	rw_exit(&zap->zap_rwlock);
@@ -454,21 +461,19 @@
 	return (zap);
 }
 
-int
-zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+static int
+zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
 {
 	zap_t *zap;
-	dmu_buf_t *db;
 	krw_t lt;
-	int err;
 
+	ASSERT0(db->db_offset);
+	objset_t *os = dmu_buf_get_objset(db);
+	uint64_t obj = db->db_object;
+
 	*zapp = NULL;
 
-	err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH);
-	if (err)
-		return (err);
-
 #ifdef ZFS_DEBUG
 	{
 		dmu_object_info_t doi;
@@ -515,10 +520,12 @@
 			dprintf("upgrading obj %llu: num_entries=%u\n",
 			    obj, zap->zap_m.zap_num_entries);
 			*zapp = zap;
-			return (mzap_upgrade(zapp, tx, 0));
+			int err = mzap_upgrade(zapp, tag, tx, 0);
+			if (err != 0)
+				rw_exit(&zap->zap_rwlock);
+			return (err);
 		}
-		err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
-		ASSERT0(err);
+		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
 		zap->zap_m.zap_num_chunks =
 		    db->db_size / MZAP_ENT_LEN - 1;
 	}
@@ -527,15 +534,49 @@
 	return (0);
 }
 
+static int
+zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
+    krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
+{
+	dmu_buf_t *db;
+	int err;
+
+	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
+	if (err != 0) {
+		return (err);
+	}
+	err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
+	if (err != 0) {
+		dmu_buf_rele(db, tag);
+	}
+	return (err);
+}
+
+int
+zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+    krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
+{
+	dmu_buf_t *db;
+	int err;
+
+	err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
+	if (err != 0)
+		return (err);
+	err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
+	if (err != 0)
+		dmu_buf_rele(db, tag);
+	return (err);
+}
+
 void
-zap_unlockdir(zap_t *zap)
+zap_unlockdir(zap_t *zap, void *tag)
 {
 	rw_exit(&zap->zap_rwlock);
-	dmu_buf_rele(zap->zap_dbuf, NULL);
+	dmu_buf_rele(zap->zap_dbuf, tag);
 }
 
 static int
-mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
+mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
 {
 	mzap_phys_t *mzp;
 	int i, sz, nchunks;
@@ -545,7 +586,7 @@
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	sz = zap->zap_dbuf->db_size;
-	mzp = kmem_alloc(sz, KM_SLEEP);
+	mzp = zio_buf_alloc(sz);
 	bcopy(zap->zap_dbuf->db_data, mzp, sz);
 	nchunks = zap->zap_m.zap_num_chunks;
 
@@ -553,7 +594,7 @@
 		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
 		    1ULL << fzap_default_block_shift, 0, tx);
 		if (err) {
-			kmem_free(mzp, sz);
+			zio_buf_free(mzp, sz);
 			return (err);
 		}
 	}
@@ -573,18 +614,19 @@
 		dprintf("adding %s=%llu\n",
 		    mze->mze_name, mze->mze_value);
 		zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT);
-		err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx);
+		err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
+		    tag, tx);
 		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
 		zap_name_free(zn);
 		if (err)
 			break;
 	}
-	kmem_free(mzp, sz);
+	zio_buf_free(mzp, sz);
 	*zapp = zap;
 	return (err);
 }
 
-static void
+void
 mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
     dmu_tx_t *tx)
 {
@@ -612,9 +654,9 @@
 		zap_t *zap;
 		/* Only fat zap supports flags; upgrade immediately. */
 		VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
-		    B_FALSE, B_FALSE, &zap));
-		VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags));
-		zap_unlockdir(zap);
+		    B_FALSE, B_FALSE, FTAG, &zap));
+		VERIFY3U(0, ==, mzap_upgrade(&zap, FTAG, tx, flags));
+		zap_unlockdir(zap, FTAG);
 	}
 }
 
@@ -665,9 +707,9 @@
 	uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
 
 	ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
-	    leaf_blockshift <= SPA_MAXBLOCKSHIFT &&
+	    leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
 	    indirect_blockshift >= SPA_MINBLOCKSHIFT &&
-	    indirect_blockshift <= SPA_MAXBLOCKSHIFT);
+	    indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
 
 	VERIFY(dmu_object_set_blocksize(os, obj,
 	    1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
@@ -688,11 +730,10 @@
 	return (dmu_object_free(os, zapobj, tx));
 }
 
-_NOTE(ARGSUSED(0))
 void
-zap_evict(dmu_buf_t *db, void *vzap)
+zap_evict(void *dbu)
 {
-	zap_t *zap = vzap;
+	zap_t *zap = dbu;
 
 	rw_destroy(&zap->zap_rwlock);
 
@@ -710,7 +751,7 @@
 	zap_t *zap;
 	int err;
 
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	if (!zap->zap_ismicro) {
@@ -718,7 +759,7 @@
 	} else {
 		*count = zap->zap_m.zap_num_entries;
 	}
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -775,25 +816,19 @@
 	    num_integers, buf, MT_EXACT, NULL, 0, NULL));
 }
 
-int
-zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
+static int
+zap_lookup_impl(zap_t *zap, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     matchtype_t mt, char *realname, int rn_len,
     boolean_t *ncp)
 {
-	zap_t *zap;
-	int err;
+	int err = 0;
 	mzap_ent_t *mze;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
-	if (err)
-		return (err);
 	zn = zap_name_alloc(zap, name, mt);
-	if (zn == NULL) {
-		zap_unlockdir(zap);
+	if (zn == NULL)
 		return (SET_ERROR(ENOTSUP));
-	}
 
 	if (!zap->zap_ismicro) {
 		err = fzap_lookup(zn, integer_size, num_integers, buf,
@@ -820,11 +855,55 @@
 		}
 	}
 	zap_name_free(zn);
-	zap_unlockdir(zap);
 	return (err);
 }
 
 int
+zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf,
+    matchtype_t mt, char *realname, int rn_len,
+    boolean_t *ncp)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_lookup_impl(zap, name, integer_size,
+	    num_integers, buf, mt, realname, rn_len, ncp);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+int
+zap_lookup_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
+	    num_integers, buf, MT_EXACT, NULL, 0, NULL));
+}
+
+int
+zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf,
+    matchtype_t mt, char *realname, int rn_len,
+    boolean_t *ncp)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+	    FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_lookup_impl(zap, name, integer_size,
+	    num_integers, buf, mt, realname, rn_len, ncp);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+int
 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints)
 {
@@ -832,18 +911,18 @@
 	int err;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	fzap_prefetch(zn);
 	zap_name_free(zn);
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -855,12 +934,12 @@
 	int err;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 
@@ -867,7 +946,7 @@
 	err = fzap_lookup(zn, integer_size, num_integers, buf,
 	    NULL, 0, NULL);
 	zap_name_free(zn);
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -874,8 +953,8 @@
 int
 zap_contains(objset_t *os, uint64_t zapobj, const char *name)
 {
-	int err = (zap_lookup_norm(os, zapobj, name, 0,
-	    0, NULL, MT_EXACT, NULL, 0, NULL));
+	int err = zap_lookup_norm(os, zapobj, name, 0,
+	    0, NULL, MT_EXACT, NULL, 0, NULL);
 	if (err == EOVERFLOW || err == EINVAL)
 		err = 0; /* found, but skipped reading the value */
 	return (err);
@@ -890,12 +969,12 @@
 	mzap_ent_t *mze;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc(zap, name, MT_EXACT);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
@@ -912,7 +991,7 @@
 		}
 	}
 	zap_name_free(zn);
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -924,17 +1003,17 @@
 	int err;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_length(zn, integer_size, num_integers);
 	zap_name_free(zn);
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -950,7 +1029,7 @@
 
 #ifdef ZFS_DEBUG
 	for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
-		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
 		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
 	}
 #endif
@@ -961,7 +1040,7 @@
 
 again:
 	for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
-		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
 		if (mze->mze_name[0] == 0) {
 			mze->mze_value = value;
 			mze->mze_cd = cd;
@@ -993,22 +1072,24 @@
 	const uint64_t *intval = val;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc(zap, key, MT_EXACT);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
-		err = fzap_add(zn, integer_size, num_integers, val, tx);
+		err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
 		zap = zn->zn_zap;	/* fzap_add() may change zap */
 	} else if (integer_size != 8 || num_integers != 1 ||
 	    strlen(key) >= MZAP_NAME_LEN) {
-		err = mzap_upgrade(&zn->zn_zap, tx, 0);
-		if (err == 0)
-			err = fzap_add(zn, integer_size, num_integers, val, tx);
+		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
+		if (err == 0) {
+			err = fzap_add(zn, integer_size, num_integers, val,
+			    FTAG, tx);
+		}
 		zap = zn->zn_zap;	/* fzap_add() may change zap */
 	} else {
 		mze = mze_find(zn);
@@ -1021,7 +1102,7 @@
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_add() failed */
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -1034,19 +1115,19 @@
 	int err;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
-	err = fzap_add(zn, integer_size, num_integers, val, tx);
+	err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
 	zap = zn->zn_zap;	/* fzap_add() may change zap */
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_add() failed */
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -1070,25 +1151,27 @@
 		(void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
 #endif
 
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc(zap, name, MT_EXACT);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
-		err = fzap_update(zn, integer_size, num_integers, val, tx);
+		err = fzap_update(zn, integer_size, num_integers, val,
+		    FTAG, tx);
 		zap = zn->zn_zap;	/* fzap_update() may change zap */
 	} else if (integer_size != 8 || num_integers != 1 ||
 	    strlen(name) >= MZAP_NAME_LEN) {
 		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
 		    zapobj, integer_size, num_integers, name);
-		err = mzap_upgrade(&zn->zn_zap, tx, 0);
-		if (err == 0)
+		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
+		if (err == 0) {
 			err = fzap_update(zn, integer_size, num_integers,
-			    val, tx);
+			    val, FTAG, tx);
+		}
 		zap = zn->zn_zap;	/* fzap_update() may change zap */
 	} else {
 		mze = mze_find(zn);
@@ -1102,7 +1185,7 @@
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -1115,19 +1198,19 @@
 	zap_name_t *zn;
 	int err;
 
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
-	err = fzap_update(zn, integer_size, num_integers, val, tx);
+	err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
 	zap = zn->zn_zap;	/* fzap_update() may change zap */
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -1146,12 +1229,12 @@
 	mzap_ent_t *mze;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc(zap, name, mt);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
@@ -1162,13 +1245,13 @@
 			err = SET_ERROR(ENOENT);
 		} else {
 			zap->zap_m.zap_num_entries--;
-			bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+			bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
 			    sizeof (mzap_ent_phys_t));
 			mze_remove(zap, mze);
 		}
 	}
 	zap_name_free(zn);
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -1180,17 +1263,17 @@
 	int err;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_remove(zn, tx);
 	zap_name_free(zn);
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -1222,7 +1305,7 @@
 {
 	if (zc->zc_zap) {
 		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
-		zap_unlockdir(zc->zc_zap);
+		zap_unlockdir(zc->zc_zap, NULL);
 		zc->zc_zap = NULL;
 	}
 	if (zc->zc_leaf) {
@@ -1269,7 +1352,7 @@
 	if (zc->zc_zap == NULL) {
 		int hb;
 		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
-		    RW_READER, TRUE, FALSE, &zc->zc_zap);
+		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
 		if (err)
 			return (err);
 
@@ -1336,7 +1419,7 @@
 
 	if (zc->zc_zap == NULL) {
 		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
-		    RW_READER, TRUE, FALSE, &zc->zc_zap);
+		    RW_READER, TRUE, FALSE, FTAG, &zc->zc_zap);
 		if (err)
 			return (err);
 	} else {
@@ -1373,7 +1456,7 @@
 	int err;
 	zap_t *zap;
 
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 
@@ -1386,31 +1469,31 @@
 	} else {
 		fzap_get_stats(zap, zs);
 	}
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (0);
 }
 
 int
-zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
-    uint64_t *towrite, uint64_t *tooverwrite)
+zap_count_write_by_dnode(dnode_t *dn, const char *name, int add,
+    refcount_t *towrite, refcount_t *tooverwrite)
 {
 	zap_t *zap;
 	int err = 0;
 
-
 	/*
 	 * Since we don't have a name, we cannot figure out which blocks will
 	 * be affected in this operation. So account for the worst case:
 	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
 	 * - 4 new blocks written if adding:
-	 * 	- 2 blocks for possibly split leaves,
-	 * 	- 2 grown ptrtbl blocks
+	 *    - 2 blocks for possibly split leaves,
+	 *    - 2 grown ptrtbl blocks
 	 *
-	 * This also accomodates the case where an add operation to a fairly
+	 * This also accommodates the case where an add operation to a fairly
 	 * large microzap results in a promotion to fatzap.
 	 */
 	if (name == NULL) {
-		*towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+		(void) refcount_add_many(towrite,
+		    (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG);
 		return (err);
 	}
 
@@ -1418,10 +1501,11 @@
 	 * We lock the zap with adding == FALSE. Because, if we pass
 	 * the actual value of add, it could trigger a mzap_upgrade().
 	 * At present we are just evaluating the possibility of this operation
-	 * and hence we donot want to trigger an upgrade.
+	 * and hence we do not want to trigger an upgrade.
 	 */
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
-	if (err)
+	err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+	    FTAG, &zap);
+	if (err != 0)
 		return (err);
 
 	if (!zap->zap_ismicro) {
@@ -1434,7 +1518,8 @@
 			/*
 			 * We treat this case as similar to (name == NULL)
 			 */
-			*towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+			(void) refcount_add_many(towrite,
+			    (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG);
 		}
 	} else {
 		/*
@@ -1452,16 +1537,20 @@
 		 * 4 new blocks written: 2 new split leaves, 2 grown
 		 *			ptrtbl blocks
 		 */
-		if (dmu_buf_freeable(zap->zap_dbuf))
-			*tooverwrite += SPA_MAXBLOCKSIZE;
-		else
-			*towrite += SPA_MAXBLOCKSIZE;
+		if (dmu_buf_freeable(zap->zap_dbuf)) {
+			(void) refcount_add_many(tooverwrite,
+			    MZAP_MAX_BLKSZ, FTAG);
+		} else {
+			(void) refcount_add_many(towrite,
+			    MZAP_MAX_BLKSZ, FTAG);
+		}
 
 		if (add) {
-			*towrite += 4 * SPA_MAXBLOCKSIZE;
+			(void) refcount_add_many(towrite,
+			    4 * MZAP_MAX_BLKSZ, FTAG);
 		}
 	}
 
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }

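The zap_micro.c changes thread a caller tag through zap_lockdir()/zap_unlockdir(), so the dbuf hold taken on the ZAP header is attributed to, and must be released by, the same caller (usually via FTAG, which in the kernel expands to the enclosing function's name). A small user-space model of that discipline (the hold_t type and helpers are illustrative assumptions, not the DMU interfaces):

#include <assert.h>
#include <stddef.h>

#define	FTAG ((const void *)__func__)	/* mirrors the kernel's FTAG idea */

/* Toy hold that remembers which tag took it. */
typedef struct hold {
	const void *h_tag;
	int h_held;
} hold_t;

static void
hold_take(hold_t *h, const void *tag)
{
	assert(!h->h_held);
	h->h_tag = tag;
	h->h_held = 1;
}

static void
hold_release(hold_t *h, const void *tag)
{
	/* Releasing with a mismatched tag is a bug, as in dmu_buf_rele(). */
	assert(h->h_held && h->h_tag == tag);
	h->h_tag = NULL;
	h->h_held = 0;
}

int
main(void)
{
	hold_t h = { NULL, 0 };

	hold_take(&h, FTAG);
	/* ... work with the ZAP while the hold is live ... */
	hold_release(&h, FTAG);
	return (0);
}
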
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
  */
 
 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -162,23 +162,25 @@
  */
 
 typedef enum {
-	FEATURE_ACTION_ENABLE,
 	FEATURE_ACTION_INCR,
 	FEATURE_ACTION_DECR,
 } feature_action_t;
 
 /*
- * Checks that the features active in the specified object are supported by
+ * Checks that the active features in the pool are supported by
  * this software.  Adds each unsupported feature (name -> description) to
  * the supplied nvlist.
  */
 boolean_t
-feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj,
+spa_features_check(spa_t *spa, boolean_t for_write,
     nvlist_t *unsup_feat, nvlist_t *enabled_feat)
 {
+	objset_t *os = spa->spa_meta_objset;
 	boolean_t supported;
 	zap_cursor_t zc;
 	zap_attribute_t za;
+	uint64_t obj = for_write ?
+	    spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
 
 	supported = B_TRUE;
 	for (zap_cursor_init(&zc, os, obj);
@@ -200,8 +202,8 @@
 				char *desc = "";
 				char buf[MAXPATHLEN];
 
-				if (zap_lookup(os, desc_obj, za.za_name,
-				    1, sizeof (buf), buf) == 0)
+				if (zap_lookup(os, spa->spa_feat_desc_obj,
+				    za.za_name, 1, sizeof (buf), buf) == 0)
 					desc = buf;
 
 				VERIFY(nvlist_add_string(unsup_feat, za.za_name,
@@ -214,13 +216,38 @@
 	return (supported);
 }
 
-static int
-feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj,
-    zfeature_info_t *feature, uint64_t *res)
+/*
+ * Use an in-memory cache of feature refcounts for quick retrieval.
+ *
+ * Note: well-designed features will not need to use this; they should
+ * use spa_feature_is_enabled() and spa_feature_is_active() instead.
+ * However, this is non-static for zdb and zhack.
+ */
+int
+feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
 {
+	ASSERT(VALID_FEATURE_FID(feature->fi_feature));
+	if (spa->spa_feat_refcount_cache[feature->fi_feature] ==
+	    SPA_FEATURE_DISABLED) {
+		return (SET_ERROR(ENOTSUP));
+	}
+	*res = spa->spa_feat_refcount_cache[feature->fi_feature];
+	return (0);
+}
+
+/*
+ * Note: well-designed features will not need to use this; they should
+ * use spa_feature_is_enabled() and spa_feature_is_active() instead.
+ * However, this is non-static for zdb and zhack.
+ */
+int
+feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
+    uint64_t *res)
+{
 	int err;
 	uint64_t refcount;
-	uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;
+	uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+	    spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
 
 	/*
 	 * If the pool is currently being created, the feature objects may not
@@ -229,8 +256,8 @@
 	if (zapobj == 0)
 		return (SET_ERROR(ENOTSUP));
 
-	err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1,
-	    &refcount);
+	err = zap_lookup(spa->spa_meta_objset, zapobj,
+	    feature->fi_guid, sizeof (uint64_t), 1, &refcount);
 	if (err != 0) {
 		if (err == ENOENT)
 			return (SET_ERROR(ENOTSUP));
@@ -241,49 +268,139 @@
 	return (0);
 }
 
+
 static int
-feature_do_action(objset_t *os, uint64_t read_obj, uint64_t write_obj,
-    uint64_t desc_obj, zfeature_info_t *feature, feature_action_t action,
+feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
+{
+	uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj;
+
+	ASSERT(zfeature_depends_on(feature->fi_feature,
+	    SPA_FEATURE_ENABLED_TXG));
+
+	if (!spa_feature_is_enabled(spa, feature->fi_feature)) {
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	ASSERT(enabled_txg_obj != 0);
+
+	VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj,
+	    feature->fi_guid, sizeof (uint64_t), 1, res));
+
+	return (0);
+}
+
+/*
+ * This function is non-static for zhack; it should otherwise not be used
+ * outside this file.
+ */
+void
+feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
     dmu_tx_t *tx)
 {
-	int error;
-	uint64_t refcount;
-	uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;
+	ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature));
+	uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+	    spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
 
+	VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid,
+	    sizeof (uint64_t), 1, &refcount, tx));
+
+	/*
+	 * feature_sync is called directly from zhack, allowing the
+	 * creation of arbitrary features whose fi_feature field may
+	 * be greater than SPA_FEATURES. When called from zhack, the
+	 * zfeature_info_t object's fi_feature field will be set to
+	 * SPA_FEATURE_NONE.
+	 */
+	if (feature->fi_feature != SPA_FEATURE_NONE) {
+		uint64_t *refcount_cache =
+		    &spa->spa_feat_refcount_cache[feature->fi_feature];
+#ifdef atomic_swap_64
+		VERIFY3U(*refcount_cache, ==,
+		    atomic_swap_64(refcount_cache, refcount));
+#else
+		*refcount_cache = refcount;
+#endif
+	}
+
+	if (refcount == 0)
+		spa_deactivate_mos_feature(spa, feature->fi_guid);
+	else if (feature->fi_flags & ZFEATURE_FLAG_MOS)
+		spa_activate_mos_feature(spa, feature->fi_guid, tx);
+}
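
The #ifdef above falls back to a plain store on platforms that lack a 64-bit atomic swap. A minimal sketch of the checked-swap idiom in isolation, assuming the usual <machine/atomic.h> atomic_swap_64() signature:

	/*
	 * Swap in the new value and verify that the displaced value is
	 * the expected one; only the syncing thread should update the
	 * cache, so a mismatch indicates a racing writer.
	 */
	static void
	refcount_cache_store(volatile uint64_t *cache, uint64_t expected,
	    uint64_t newval)
	{
		VERIFY3U(expected, ==, atomic_swap_64(cache, newval));
	}
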
+
+/*
+ * This function is non-static for zhack; it should otherwise not be used
+ * outside this file.
+ */
+void
+feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+{
+	uint64_t initial_refcount =
+	    (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0;
+	uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+	    spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
 	ASSERT(0 != zapobj);
 	ASSERT(zfeature_is_valid_guid(feature->fi_guid));
+	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
 
-	error = zap_lookup(os, zapobj, feature->fi_guid,
-	    sizeof (uint64_t), 1, &refcount);
-
 	/*
-	 * If we can't ascertain the status of the specified feature, an I/O
-	 * error occurred.
+	 * If the feature is already enabled, ignore the request.
 	 */
-	if (error != 0 && error != ENOENT)
-		return (error);
+	if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0)
+		return;
 
+	for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++)
+		spa_feature_enable(spa, feature->fi_depends[i], tx);
+
+	VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj,
+	    feature->fi_guid, 1, strlen(feature->fi_desc) + 1,
+	    feature->fi_desc, tx));
+
+	feature_sync(spa, feature, initial_refcount, tx);
+
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) {
+		uint64_t enabling_txg = dmu_tx_get_txg(tx);
+
+		if (spa->spa_feat_enabled_txg_obj == 0ULL) {
+			spa->spa_feat_enabled_txg_obj =
+			    zap_create_link(spa->spa_meta_objset,
+			    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+			    DMU_POOL_FEATURE_ENABLED_TXG, tx);
+		}
+		spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx);
+
+		VERIFY0(zap_add(spa->spa_meta_objset,
+		    spa->spa_feat_enabled_txg_obj, feature->fi_guid,
+		    sizeof (uint64_t), 1, &enabling_txg, tx));
+	}
+}
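
The dependency loop above recurses through spa_feature_enable() until it reaches the SPA_FEATURE_NONE terminator. A sketch of how a hypothetical feature would declare such a dependency list (the feature itself is made up; the terminator convention is what the loop relies on):

	/* Hypothetical feature: depends on SPA_FEATURE_ENABLED_TXG. */
	static const spa_feature_t example_deps[] = {
		SPA_FEATURE_ENABLED_TXG,
		SPA_FEATURE_NONE	/* terminator checked by the loop */
	};
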
+
+static void
+feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action,
+    dmu_tx_t *tx)
+{
+	uint64_t refcount;
+	zfeature_info_t *feature = &spa_feature_table[fid];
+	uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+	    spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+	ASSERT(VALID_FEATURE_FID(fid));
+	ASSERT(0 != zapobj);
+	ASSERT(zfeature_is_valid_guid(feature->fi_guid));
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+
+	VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP);
+
 	switch (action) {
-	case FEATURE_ACTION_ENABLE:
-		/*
-		 * If the feature is already enabled, ignore the request.
-		 */
-		if (error == 0)
-			return (0);
-		refcount = 0;
-		break;
 	case FEATURE_ACTION_INCR:
-		if (error == ENOENT)
-			return (SET_ERROR(ENOTSUP));
-		if (refcount == UINT64_MAX)
-			return (SET_ERROR(EOVERFLOW));
+		VERIFY3U(refcount, !=, UINT64_MAX);
 		refcount++;
 		break;
 	case FEATURE_ACTION_DECR:
-		if (error == ENOENT)
-			return (SET_ERROR(ENOTSUP));
-		if (refcount == 0)
-			return (SET_ERROR(EOVERFLOW));
+		VERIFY3U(refcount, !=, 0);
 		refcount--;
 		break;
 	default:
@@ -291,42 +408,7 @@
 		break;
 	}
 
-	if (action == FEATURE_ACTION_ENABLE) {
-		int i;
-
-		for (i = 0; feature->fi_depends[i] != NULL; i++) {
-			zfeature_info_t *dep = feature->fi_depends[i];
-
-			error = feature_do_action(os, read_obj, write_obj,
-			    desc_obj, dep, FEATURE_ACTION_ENABLE, tx);
-			if (error != 0)
-				return (error);
-		}
-	}
-
-	error = zap_update(os, zapobj, feature->fi_guid,
-	    sizeof (uint64_t), 1, &refcount, tx);
-	if (error != 0)
-		return (error);
-
-	if (action == FEATURE_ACTION_ENABLE) {
-		error = zap_update(os, desc_obj,
-		    feature->fi_guid, 1, strlen(feature->fi_desc) + 1,
-		    feature->fi_desc, tx);
-		if (error != 0)
-			return (error);
-	}
-
-	if (action == FEATURE_ACTION_INCR && refcount == 1 && feature->fi_mos) {
-		spa_activate_mos_feature(dmu_objset_spa(os), feature->fi_guid);
-	}
-
-	if (action == FEATURE_ACTION_DECR && refcount == 0) {
-		spa_deactivate_mos_feature(dmu_objset_spa(os),
-		    feature->fi_guid);
-	}
-
-	return (0);
+	feature_sync(spa, feature, refcount, tx);
 }
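
Note the behavioral change here: refcount underflow/overflow and use of a disabled feature are now fatal VERIFYs instead of EOVERFLOW/ENOTSUP returns. A sketch of the expected calling pattern from syncing context, with fid and tx assumed to come from the caller:

	if (!spa_feature_is_enabled(spa, fid))
		spa_feature_enable(spa, fid, tx);	/* must come first */
	spa_feature_incr(spa, fid, tx);	/* would VERIFY if still disabled */
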
 
 void
@@ -354,72 +436,75 @@
  * Enable any required dependencies, then enable the requested feature.
  */
 void
-spa_feature_enable(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
 {
 	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
-	VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
-	    spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
-	    spa->spa_feat_desc_obj, feature, FEATURE_ACTION_ENABLE, tx));
+	ASSERT(VALID_FEATURE_FID(fid));
+	feature_enable_sync(spa, &spa_feature_table[fid], tx);
 }
 
-/*
- * If the specified feature has not yet been enabled, this function returns
- * ENOTSUP; otherwise, this function increments the feature's refcount (or
- * returns EOVERFLOW if the refcount cannot be incremented). This function must
- * be called from syncing context.
- */
 void
-spa_feature_incr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+spa_feature_incr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
 {
-	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
-	VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
-	    spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
-	    spa->spa_feat_desc_obj, feature, FEATURE_ACTION_INCR, tx));
+	feature_do_action(spa, fid, FEATURE_ACTION_INCR, tx);
 }
 
-/*
- * If the specified feature has not yet been enabled, this function returns
- * ENOTSUP; otherwise, this function decrements the feature's refcount (or
- * returns EOVERFLOW if the refcount is already 0). This function must
- * be called from syncing context.
- */
 void
-spa_feature_decr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+spa_feature_decr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
 {
-	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
-	VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
-	    spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
-	    spa->spa_feat_desc_obj, feature, FEATURE_ACTION_DECR, tx));
+	feature_do_action(spa, fid, FEATURE_ACTION_DECR, tx);
 }
 
 boolean_t
-spa_feature_is_enabled(spa_t *spa, zfeature_info_t *feature)
+spa_feature_is_enabled(spa_t *spa, spa_feature_t fid)
 {
 	int err;
 	uint64_t refcount;
 
+	ASSERT(VALID_FEATURE_FID(fid));
 	if (spa_version(spa) < SPA_VERSION_FEATURES)
 		return (B_FALSE);
 
-	err = feature_get_refcount(spa->spa_meta_objset,
-	    spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
-	    feature, &refcount);
+	err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
 	ASSERT(err == 0 || err == ENOTSUP);
 	return (err == 0);
 }
 
 boolean_t
-spa_feature_is_active(spa_t *spa, zfeature_info_t *feature)
+spa_feature_is_active(spa_t *spa, spa_feature_t fid)
 {
 	int err;
 	uint64_t refcount;
 
+	ASSERT(VALID_FEATURE_FID(fid));
 	if (spa_version(spa) < SPA_VERSION_FEATURES)
 		return (B_FALSE);
 
-	err = feature_get_refcount(spa->spa_meta_objset,
-	    spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
-	    feature, &refcount);
+	err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
 	ASSERT(err == 0 || err == ENOTSUP);
 	return (err == 0 && refcount > 0);
 }
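
The two predicates differ only in the refcount test: "enabled" means the refcount lookup succeeds, "active" additionally requires a nonzero refcount. A small sketch of the resulting invariant (fid assumed):

	boolean_t enabled = spa_feature_is_enabled(spa, fid);
	boolean_t active = spa_feature_is_active(spa, fid);

	/* Active implies enabled, never the reverse. */
	ASSERT(!active || enabled);
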
+
+/*
+ * For the feature specified by fid (which must depend on
+ * SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the
+ * OUT txg argument.
+ *
+ * Returns B_TRUE if the feature is enabled, in which case txg will be filled
+ * with the transaction group in which the specified feature was enabled.
+ * Returns B_FALSE otherwise (i.e. if the feature is not enabled).
+ */
+boolean_t
+spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg)
+{
+	int err;
+
+	ASSERT(VALID_FEATURE_FID(fid));
+	if (spa_version(spa) < SPA_VERSION_FEATURES)
+		return (B_FALSE);
+
+	err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg);
+	ASSERT(err == 0 || err == ENOTSUP);
+
+	return (err == 0);
+}
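
A hedged sketch of a typical caller, comparing a block's birth txg against the feature's enabling txg (birth_txg is assumed to come from the caller):

	uint64_t enabled_txg;

	if (spa_feature_enabled_txg(spa, fid, &enabled_txg) &&
	    birth_txg >= enabled_txg) {
		/* The block was born after the feature was enabled. */
	}
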

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1059,8 +1059,7 @@
  * create a new acl and leave any cached acl in place.
  */
 static int
-zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
-    boolean_t will_modify)
+zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
 {
 	zfs_acl_t	*aclp;
 	int		aclsize;
@@ -1069,9 +1068,9 @@
 	zfs_acl_phys_t	znode_acl;
 	int		version;
 	int		error;
-	boolean_t	drop_lock = B_FALSE;
 
 	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+	ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
 
 	if (zp->z_acl_cached && !will_modify) {
 		*aclpp = zp->z_acl_cached;
@@ -1078,17 +1077,6 @@
 		return (0);
 	}
 
-	/*
-	 * close race where znode could be upgrade while trying to
-	 * read the znode attributes.
-	 *
-	 * But this could only happen if the file isn't already an SA
-	 * znode
-	 */
-	if (!zp->z_is_sa && !have_lock) {
-		mutex_enter(&zp->z_lock);
-		drop_lock = B_TRUE;
-	}
 	version = zfs_znode_acl_version(zp);
 
 	if ((error = zfs_acl_znode_info(zp, &aclsize,
@@ -1134,8 +1122,6 @@
 	if (!will_modify)
 		zp->z_acl_cached = aclp;
 done:
-	if (drop_lock)
-		mutex_exit(&zp->z_lock);
 	return (error);
 }
 
@@ -1162,10 +1148,10 @@
 	int error;
 	zfs_acl_t *aclp;
 
-	ASSERT(MUTEX_HELD(&zp->z_lock));
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
 
-	if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0)
+	if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0)
 		zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
 		    &zp->z_pflags, zp->z_uid, zp->z_gid);
 	return (error);
@@ -1189,6 +1175,7 @@
 	sa_bulk_attr_t		bulk[5];
 	uint64_t		ctime[2];
 	int			count = 0;
+	zfs_acl_phys_t		acl_phys;
 
 	mode = zp->z_mode;
 
@@ -1235,7 +1222,6 @@
 	} else { /* Painful legacy way */
 		zfs_acl_node_t *aclnode;
 		uint64_t off = 0;
-		zfs_acl_phys_t acl_phys;
 		uint64_t aoid;
 
 		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
@@ -1446,11 +1432,11 @@
 	int error = 0;
 
 	mutex_enter(&zp->z_acl_lock);
-	mutex_enter(&zp->z_lock);
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 	if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
 		*aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
 	else
-		error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE);
+		error = zfs_acl_node_read(zp, aclp, B_TRUE);
 
 	if (error == 0) {
 		(*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
@@ -1457,7 +1443,6 @@
 		zfs_acl_chmod(ZTOV(zp)->v_type, mode,
 		    (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
 	}
-	mutex_exit(&zp->z_lock);
 	mutex_exit(&zp->z_acl_lock);
 
 	return (error);
@@ -1628,6 +1613,10 @@
 	boolean_t	need_chmod = B_TRUE;
 	boolean_t	inherited = B_FALSE;
 
+	if ((flag & IS_ROOT_NODE) == 0)
+		ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+	else
+		ASSERT(dzp->z_vnode == NULL);
 	bzero(acl_ids, sizeof (zfs_acl_ids_t));
 	acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
 
@@ -1684,7 +1673,7 @@
 			} else {
 				acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
 				    ZFS_GROUP, cr, &acl_ids->z_fuidp);
-#ifdef __FreeBSD__
+#ifdef __FreeBSD_kernel__
 				gid = acl_ids->z_fgid = dzp->z_gid;
 #else
 				gid = crgetgid(cr);
@@ -1711,12 +1700,10 @@
 
 	if (acl_ids->z_aclp == NULL) {
 		mutex_enter(&dzp->z_acl_lock);
-		mutex_enter(&dzp->z_lock);
 		if (!(flag & IS_ROOT_NODE) &&
 		    (dzp->z_pflags & ZFS_INHERIT_ACE) &&
 		    !(dzp->z_pflags & ZFS_XATTR)) {
-			VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
-			    &paclp, B_FALSE));
+			VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
 			acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
 			    vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
 			inherited = B_TRUE;
@@ -1725,7 +1712,6 @@
 			    zfs_acl_alloc(zfs_acl_version_zp(dzp));
 			acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
 		}
-		mutex_exit(&dzp->z_lock);
 		mutex_exit(&dzp->z_acl_lock);
 		if (need_chmod) {
 			acl_ids->z_aclp->z_hints |= (vap->va_type == VDIR) ?
@@ -1791,7 +1777,8 @@
 
 	mutex_enter(&zp->z_acl_lock);
 
-	error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+	ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+	error = zfs_acl_node_read(zp, &aclp, B_FALSE);
 	if (error != 0) {
 		mutex_exit(&zp->z_acl_lock);
 		return (error);
@@ -1939,6 +1926,7 @@
 	boolean_t	fuid_dirtied;
 	uint64_t	acl_obj;
 
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 	if (mask == 0)
 		return (SET_ERROR(ENOSYS));
 
@@ -1963,7 +1951,6 @@
 	}
 top:
 	mutex_enter(&zp->z_acl_lock);
-	mutex_enter(&zp->z_lock);
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 
@@ -1996,7 +1983,6 @@
 	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		mutex_exit(&zp->z_acl_lock);
-		mutex_exit(&zp->z_lock);
 
 		if (error == ERESTART) {
 			dmu_tx_wait(tx);
@@ -2021,8 +2007,6 @@
 	if (fuidp)
 		zfs_fuid_info_free(fuidp);
 	dmu_tx_commit(tx);
-done:
-	mutex_exit(&zp->z_lock);
 	mutex_exit(&zp->z_acl_lock);
 
 	return (error);
@@ -2054,7 +2038,7 @@
 		return (SET_ERROR(EPERM));
 	}
 
-#ifdef sun
+#ifdef illumos
 	if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
 	    (zp->z_pflags & ZFS_NOUNLINK)) {
 		return (SET_ERROR(EPERM));
@@ -2126,7 +2110,8 @@
 
 	mutex_enter(&zp->z_acl_lock);
 
-	error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+	ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+	error = zfs_acl_node_read(zp, &aclp, B_FALSE);
 	if (error != 0) {
 		mutex_exit(&zp->z_acl_lock);
 		return (error);
@@ -2375,7 +2360,7 @@
 
 	is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
 
-#ifdef __FreeBSD__
+#ifdef __FreeBSD_kernel__
 	/*
 	 * In FreeBSD, we don't care about permissions of individual ADS.
 	 * Note that not checking them is not just an optimization - without

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
  */
 
 /*
@@ -70,136 +71,253 @@
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/namei.h>
-#include <sys/gfs.h>
 #include <sys/stat.h>
 #include <sys/dmu.h>
+#include <sys/dsl_dataset.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_deleg.h>
 #include <sys/mount.h>
-#include <sys/sunddi.h>
+#include <sys/zap.h>
 
 #include "zfs_namecheck.h"
 
-typedef struct zfsctl_node {
-	gfs_dir_t	zc_gfs_private;
-	uint64_t	zc_id;
-	timestruc_t	zc_cmtime;	/* ctime and mtime, always the same */
-} zfsctl_node_t;
+/* Common access mode for all virtual directories under the ctldir */
+const u_short zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
+    S_IROTH | S_IXOTH;
 
-typedef struct zfsctl_snapdir {
-	zfsctl_node_t	sd_node;
-	kmutex_t	sd_lock;
-	avl_tree_t	sd_snaps;
-} zfsctl_snapdir_t;
+/*
+ * "Synthetic" filesystem implementation.
+ */
 
-typedef struct {
-	char		*se_name;
-	vnode_t		*se_root;
-	avl_node_t	se_node;
-} zfs_snapentry_t;
+/*
+ * Assert that A implies B.
+ */
+#define KASSERT_IMPLY(A, B, msg)	KASSERT(!(A) || (B), (msg))
 
+static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes");
+
+typedef struct sfs_node {
+	char		sn_name[ZFS_MAX_DATASET_NAME_LEN];
+	uint64_t	sn_parent_id;
+	uint64_t	sn_id;
+} sfs_node_t;
+
+/*
+ * Check the parent's ID as well as the node's to account for a chance
+ * that IDs originating from different domains (snapshot IDs, artificial
+ * IDs, znode IDs) may clash.
+ */
 static int
-snapentry_compare(const void *a, const void *b)
+sfs_compare_ids(struct vnode *vp, void *arg)
 {
-	const zfs_snapentry_t *sa = a;
-	const zfs_snapentry_t *sb = b;
-	int ret = strcmp(sa->se_name, sb->se_name);
+	sfs_node_t *n1 = vp->v_data;
+	sfs_node_t *n2 = arg;
+	bool equal;
 
-	if (ret < 0)
-		return (-1);
-	else if (ret > 0)
-		return (1);
-	else
-		return (0);
+	equal = n1->sn_id == n2->sn_id &&
+	    n1->sn_parent_id == n2->sn_parent_id;
+
+	/* Zero means equality. */
+	return (!equal);
 }
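
An example of the clash the comparator guards against, with made-up values: a snapshot whose objset ID happened to equal ZFSCTL_INO_SNAPDIR would hash to the same bucket, but the parent IDs disambiguate the two nodes:

	sfs_node_t a = { .sn_name = "snapshot", .sn_parent_id = 1, .sn_id = 2 };
	sfs_node_t b = { .sn_name = "snap1", .sn_parent_id = 2, .sn_id = 2 };

	/* Equal sn_id alone is not enough; the parent IDs differ. */
	ASSERT(a.sn_id == b.sn_id && a.sn_parent_id != b.sn_parent_id);
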
 
-#ifdef sun
-vnodeops_t *zfsctl_ops_root;
-vnodeops_t *zfsctl_ops_snapdir;
-vnodeops_t *zfsctl_ops_snapshot;
-vnodeops_t *zfsctl_ops_shares;
-vnodeops_t *zfsctl_ops_shares_dir;
+static int
+sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id,
+   uint64_t id, struct vnode **vpp)
+{
+	sfs_node_t search;
+	int err;
 
-static const fs_operation_def_t zfsctl_tops_root[];
-static const fs_operation_def_t zfsctl_tops_snapdir[];
-static const fs_operation_def_t zfsctl_tops_snapshot[];
-static const fs_operation_def_t zfsctl_tops_shares[];
-#else	/* !sun */
-static struct vop_vector zfsctl_ops_root;
-static struct vop_vector zfsctl_ops_snapdir;
-static struct vop_vector zfsctl_ops_snapshot;
-static struct vop_vector zfsctl_ops_shares;
-static struct vop_vector zfsctl_ops_shares_dir;
-#endif	/* !sun */
+	search.sn_id = id;
+	search.sn_parent_id = parent_id;
+	err = vfs_hash_get(mp, (u_int)id, flags, curthread, vpp,
+	    sfs_compare_ids, &search);
+	return (err);
+}
 
-static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
-static vnode_t *zfsctl_mknode_shares(vnode_t *);
-static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
-static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);
+static int
+sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id,
+   uint64_t id, struct vnode **vpp)
+{
+	int err;
 
-#ifdef sun
-static gfs_opsvec_t zfsctl_opsvec[] = {
-	{ ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
-	{ ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
-	{ ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
-	{ ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares_dir },
-	{ ".zfs/shares/vnode", zfsctl_tops_shares, &zfsctl_ops_shares },
-	{ NULL }
-};
-#endif	/* sun */
+	KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data"));
+	err = vfs_hash_insert(vp, (u_int)id, flags, curthread, vpp,
+	    sfs_compare_ids, vp->v_data);
+	return (err);
+}
 
-/*
- * Root directory elements.  We only have two entries
- * snapshot and shares.
- */
-static gfs_dirent_t zfsctl_root_entries[] = {
-	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
-	{ "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE },
-	{ NULL }
-};
+static void
+sfs_vnode_remove(struct vnode *vp)
+{
+	vfs_hash_remove(vp);
+}
 
-/* include . and .. in the calculation */
-#define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
-    sizeof (gfs_dirent_t)) + 1)
+typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg);
 
+static int
+sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id,
+    const char *tag, struct vop_vector *vops,
+    sfs_vnode_setup_fn setup, void *arg,
+    struct vnode **vpp)
+{
+	struct vnode *vp;
+	int error;
 
+	error = sfs_vnode_get(mp, flags, parent_id, id, vpp);
+	if (error != 0 || *vpp != NULL) {
+		KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+		    "sfs vnode with no data");
+		return (error);
+	}
+
+	/* Allocate a new vnode/inode. */
+	error = getnewvnode(tag, mp, vops, &vp);
+	if (error != 0) {
+		*vpp = NULL;
+		return (error);
+	}
+
+	/*
+	 * Exclusively lock the vnode while it's being constructed.
+	 */
+	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
+	error = insmntque(vp, mp);
+	if (error != 0) {
+		*vpp = NULL;
+		return (error);
+	}
+
+	setup(vp, arg);
+
+	error = sfs_vnode_insert(vp, flags, parent_id, id, vpp);
+	if (error != 0 || *vpp != NULL) {
+		KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+		    "sfs vnode with no data");
+		return (error);
+	}
+
+	*vpp = vp;
+	return (0);
+}
+
+static void
+sfs_print_node(sfs_node_t *node)
+{
+	printf("\tname = %s\n", node->sn_name);
+	printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id);
+	printf("\tid = %ju\n", (uintmax_t)node->sn_id);
+}
+
+static sfs_node_t *
+sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id)
+{
+	struct sfs_node *node;
+
+	KASSERT(strlen(name) < sizeof(node->sn_name),
+	    ("sfs node name is too long"));
+	KASSERT(size >= sizeof(*node), ("sfs node size is too small"));
+	node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO);
+	strlcpy(node->sn_name, name, sizeof(node->sn_name));
+	node->sn_parent_id = parent_id;
+	node->sn_id = id;
+
+	return (node);
+}
+
+static void
+sfs_destroy_node(sfs_node_t *node)
+{
+	free(node, M_SFSNODES);
+}
+
+static void *
+sfs_reclaim_vnode(vnode_t *vp)
+{
+	sfs_node_t *node;
+	void *data;
+
+	sfs_vnode_remove(vp);
+	data = vp->v_data;
+	vp->v_data = NULL;
+	return (data);
+}
+
+static int
+sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap,
+    uio_t *uio, off_t *offp)
+{
+	struct dirent entry;
+	int error;
+
+	/* Reset ncookies for subsequent use of vfs_read_dirent. */
+	if (ap->a_ncookies != NULL)
+		*ap->a_ncookies = 0;
+
+	if (uio->uio_resid < sizeof(entry))
+		return (SET_ERROR(EINVAL));
+
+	if (uio->uio_offset < 0)
+		return (SET_ERROR(EINVAL));
+	if (uio->uio_offset == 0) {
+		entry.d_fileno = id;
+		entry.d_type = DT_DIR;
+		entry.d_name[0] = '.';
+		entry.d_name[1] = '\0';
+		entry.d_namlen = 1;
+		entry.d_reclen = sizeof(entry);
+		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+		if (error != 0)
+			return (SET_ERROR(error));
+	}
+
+	if (uio->uio_offset < sizeof(entry))
+		return (SET_ERROR(EINVAL));
+	if (uio->uio_offset == sizeof(entry)) {
+		entry.d_fileno = parent_id;
+		entry.d_type = DT_DIR;
+		entry.d_name[0] = '.';
+		entry.d_name[1] = '.';
+		entry.d_name[2] = '\0';
+		entry.d_namlen = 2;
+		entry.d_reclen = sizeof(entry);
+		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+		if (error != 0)
+			return (SET_ERROR(error));
+	}
+
+	if (offp != NULL)
+		*offp = 2 * sizeof(entry);
+	return (0);
+}
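
Because the records are fixed-size, the offsets are deterministic: "." lives at offset 0, ".." at sizeof(struct dirent), and real entries begin at 2 * sizeof(struct dirent), which is what *offp reports. A caller-side sketch (parent_id, id, ap, and uio assumed from the enclosing readdir vop):

	off_t dots_offset;

	error = sfs_readdir_common(parent_id, id, ap, uio, &dots_offset);
	if (error == 0)
		ASSERT3U(dots_offset, ==, 2 * sizeof (struct dirent));
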
+
+
 /*
- * Initialize the various GFS pieces we'll need to create and manipulate .zfs
- * directories.  This is called from the ZFS init routine, and initializes the
- * vnode ops vectors that we'll be using.
+ * .zfs inode namespace
+ *
+ * We need to generate unique inode numbers for all files and directories
+ * within the .zfs pseudo-filesystem.  We use the following scheme:
+ *
+ * 	ENTRY			ZFSCTL_INODE
+ * 	.zfs			1
+ * 	.zfs/snapshot		2
+ * 	.zfs/snapshot/<snap>	objectid(snap)
  */
+#define	ZFSCTL_INO_SNAP(id)	(id)
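
So a snapshot's inode under .zfs/snapshot is simply its objset ID, e.g. (value hypothetical):

	uint64_t ino = ZFSCTL_INO_SNAP(0x1234);	/* == 0x1234 */
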
+
+static struct vop_vector zfsctl_ops_root;
+static struct vop_vector zfsctl_ops_snapdir;
+static struct vop_vector zfsctl_ops_snapshot;
+static struct vop_vector zfsctl_ops_shares_dir;
+
 void
 zfsctl_init(void)
 {
-#ifdef sun
-	VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
-#endif
 }
 
 void
 zfsctl_fini(void)
 {
-#ifdef sun
-	/*
-	 * Remove vfsctl vnode ops
-	 */
-	if (zfsctl_ops_root)
-		vn_freevnodeops(zfsctl_ops_root);
-	if (zfsctl_ops_snapdir)
-		vn_freevnodeops(zfsctl_ops_snapdir);
-	if (zfsctl_ops_snapshot)
-		vn_freevnodeops(zfsctl_ops_snapshot);
-	if (zfsctl_ops_shares)
-		vn_freevnodeops(zfsctl_ops_shares);
-	if (zfsctl_ops_shares_dir)
-		vn_freevnodeops(zfsctl_ops_shares_dir);
-
-	zfsctl_ops_root = NULL;
-	zfsctl_ops_snapdir = NULL;
-	zfsctl_ops_snapshot = NULL;
-	zfsctl_ops_shares = NULL;
-	zfsctl_ops_shares_dir = NULL;
-#endif	/* sun */
 }
 
 boolean_t
@@ -208,96 +326,119 @@
 	return (vn_matchops(vp, zfsctl_ops_root) ||
 	    vn_matchops(vp, zfsctl_ops_snapdir) ||
 	    vn_matchops(vp, zfsctl_ops_snapshot) ||
-	    vn_matchops(vp, zfsctl_ops_shares) ||
 	    vn_matchops(vp, zfsctl_ops_shares_dir));
 
 }
 
-/*
- * Return the inode number associated with the 'snapshot' or
- * 'shares' directory.
- */
-/* ARGSUSED */
-static ino64_t
-zfsctl_root_inode_cb(vnode_t *vp, int index)
-{
-	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+typedef struct zfsctl_root {
+	sfs_node_t	node;
+	sfs_node_t	*snapdir;
+	timestruc_t	cmtime;
+} zfsctl_root_t;
 
-	ASSERT(index <= 2);
 
-	if (index == 0)
-		return (ZFSCTL_INO_SNAPDIR);
-
-	return (zfsvfs->z_shares_dir);
-}
-
 /*
- * Create the '.zfs' directory.  This directory is cached as part of the VFS
- * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
- * therefore checks against a vfs_count of 2 instead of 1.  This reference
- * is removed when the ctldir is destroyed in the unmount.
+ * Create the '.zfs' directory.
  */
 void
 zfsctl_create(zfsvfs_t *zfsvfs)
 {
-	vnode_t *vp, *rvp;
-	zfsctl_node_t *zcp;
+	zfsctl_root_t *dot_zfs;
+	sfs_node_t *snapdir;
+	vnode_t *rvp;
 	uint64_t crtime[2];
 
 	ASSERT(zfsvfs->z_ctldir == NULL);
 
-	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
-	    &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
-	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
-	zcp = vp->v_data;
-	zcp->zc_id = ZFSCTL_INO_ROOT;
+	snapdir = sfs_alloc_node(sizeof(*snapdir), "snapshot", ZFSCTL_INO_ROOT,
+	    ZFSCTL_INO_SNAPDIR);
+	dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof(*dot_zfs), ".zfs", 0,
+	    ZFSCTL_INO_ROOT);
+	dot_zfs->snapdir = snapdir;
 
 	VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
 	VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
-	    &crtime, sizeof (crtime)));
-	ZFS_TIME_DECODE(&zcp->zc_cmtime, crtime);
-	VN_URELE(rvp);
+	    &crtime, sizeof(crtime)));
+	ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime);
+	vput(rvp);
 
-	/*
-	 * We're only faking the fact that we have a root of a filesystem for
-	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
-	 * for us.
-	 */
-	vp->v_vflag &= ~VV_ROOT;
-
-	zfsvfs->z_ctldir = vp;
-
-	VOP_UNLOCK(vp, 0);
+	zfsvfs->z_ctldir = dot_zfs;
 }
 
 /*
  * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
- * There might still be more references if we were force unmounted, but only
- * new zfs_inactive() calls can occur and they don't reference .zfs
+ * By now the nodes must not have any associated vnodes; those should
+ * already have been reclaimed by vflush().
  */
 void
 zfsctl_destroy(zfsvfs_t *zfsvfs)
 {
-	VN_RELE(zfsvfs->z_ctldir);
+	sfs_destroy_node(zfsvfs->z_ctldir->snapdir);
+	sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir);
 	zfsvfs->z_ctldir = NULL;
 }
 
+static int
+zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags,
+    struct vnode **vpp)
+{
+	return (VFS_ROOT(mp, flags, vpp));
+}
+
+static void
+zfsctl_common_vnode_setup(vnode_t *vp, void *arg)
+{
+	ASSERT_VOP_ELOCKED(vp, __func__);
+
+	/* We support shared locking. */
+	VN_LOCK_ASHARE(vp);
+	vp->v_type = VDIR;
+	vp->v_data = arg;
+}
+
+static int
+zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags,
+    struct vnode **vpp)
+{
+	void *node;
+	int err;
+
+	node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir;
+	err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root,
+	    zfsctl_common_vnode_setup, node, vpp);
+	return (err);
+}
+
+static int
+zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags,
+    struct vnode **vpp)
+{
+	void *node;
+	int err;
+
+	node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir->snapdir;
+	err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs",
+	   &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp);
+	return (err);
+}
+
 /*
- * Given a root znode, retrieve the associated .zfs directory.
- * Add a hold to the vnode and return it.
+ * Given a zfsvfs, return in *vpp the vnode of its .zfs directory,
+ * referenced and locked according to "flags".
  */
-vnode_t *
-zfsctl_root(znode_t *zp)
+int
+zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp)
 {
-	ASSERT(zfs_has_ctldir(zp));
-	VN_HOLD(zp->z_zfsvfs->z_ctldir);
-	return (zp->z_zfsvfs->z_ctldir);
+	int error;
+
+	error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp);
+	return (error);
 }
 
 /*
  * Common open routine.  Disallow any write access.
  */
-/* ARGSUSED */
 static int
 zfsctl_common_open(struct vop_open_args *ap)
 {
@@ -322,7 +463,6 @@
 /*
  * Common access routine.  Disallow writes.
  */
-/* ARGSUSED */
 static int
 zfsctl_common_access(ap)
 	struct vop_access_args /* {
@@ -334,18 +474,8 @@
 {
 	accmode_t accmode = ap->a_accmode;
 
-#ifdef TODO
-	if (flags & V_ACE_MASK) {
-		if (accmode & ACE_ALL_WRITE_PERMS)
-			return (SET_ERROR(EACCES));
-	} else {
-#endif
-		if (accmode & VWRITE)
-			return (SET_ERROR(EACCES));
-#ifdef TODO
-	}
-#endif
-
+	if (accmode & VWRITE)
+		return (SET_ERROR(EACCES));
 	return (0);
 }
 
@@ -356,7 +486,10 @@
 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
 {
 	timestruc_t	now;
+	sfs_node_t *node;
 
+	node = vp->v_data;
+
 	vap->va_uid = 0;
 	vap->va_gid = 0;
 	vap->va_rdev = 0;
@@ -368,8 +501,7 @@
 	vap->va_nblocks = 0;
 	vap->va_seq = 0;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
-	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
-	    S_IROTH | S_IXOTH;
+	vap->va_mode = zfsctl_ctldir_mode;
 	vap->va_type = VDIR;
 	/*
 	 * We live in the now (for atime).
@@ -378,9 +510,13 @@
 	vap->va_atime = now;
 	/* FreeBSD: Reset chflags(2) flags. */
 	vap->va_flags = 0;
+
+	vap->va_nodeid = node->sn_id;
+
+	/* At least '.' and '..'. */
+	vap->va_nlink = 2;
 }
 
-/*ARGSUSED*/
 static int
 zfsctl_common_fid(ap)
 	struct vop_fid_args /* {
@@ -390,107 +526,50 @@
 {
 	vnode_t		*vp = ap->a_vp;
 	fid_t		*fidp = (void *)ap->a_fid;
-	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
-	zfsctl_node_t	*zcp = vp->v_data;
-	uint64_t	object = zcp->zc_id;
+	sfs_node_t	*node = vp->v_data;
+	uint64_t	object = node->sn_id;
 	zfid_short_t	*zfid;
 	int		i;
 
-	ZFS_ENTER(zfsvfs);
-
-#ifdef illumos
-	if (fidp->fid_len < SHORT_FID_LEN) {
-		fidp->fid_len = SHORT_FID_LEN;
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(ENOSPC));
-	}
-#else
-	fidp->fid_len = SHORT_FID_LEN;
-#endif
-
 	zfid = (zfid_short_t *)fidp;
-
 	zfid->zf_len = SHORT_FID_LEN;
 
-	for (i = 0; i < sizeof (zfid->zf_object); i++)
+	for (i = 0; i < sizeof(zfid->zf_object); i++)
 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 
-	/* .zfs znodes always have a generation number of 0 */
-	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+	/* .zfs nodes always have a generation number of 0 */
+	for (i = 0; i < sizeof(zfid->zf_gen); i++)
 		zfid->zf_gen[i] = 0;
 
-	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
-
-/*ARGSUSED*/
 static int
-zfsctl_shares_fid(ap)
-	struct vop_fid_args /* {
+zfsctl_common_reclaim(ap)
+	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
-		struct fid *a_fid;
+		struct thread *a_td;
 	} */ *ap;
 {
-	vnode_t		*vp = ap->a_vp;
-	fid_t		*fidp = (void *)ap->a_fid;
-	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
-	znode_t		*dzp;
-	int		error;
+	vnode_t *vp = ap->a_vp;
 
-	ZFS_ENTER(zfsvfs);
-
-	if (zfsvfs->z_shares_dir == 0) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(ENOTSUP));
-	}
-
-	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
-		error = VOP_FID(ZTOV(dzp), fidp);
-		VN_RELE(ZTOV(dzp));
-	}
-
-	ZFS_EXIT(zfsvfs);
-	return (error);
+	(void) sfs_reclaim_vnode(vp);
+	return (0);
 }
 
 static int
-zfsctl_common_reclaim(ap)
-	struct vop_reclaim_args /* {
+zfsctl_common_print(ap)
+	struct vop_print_args /* {
 		struct vnode *a_vp;
-		struct thread *a_td;
 	} */ *ap;
 {
-	vnode_t *vp = ap->a_vp;
-
-	/*
-	 * Destroy the vm object and flush associated pages.
-	 */
-	vnode_destroy_vobject(vp);
-	VI_LOCK(vp);
-	vp->v_data = NULL;
-	VI_UNLOCK(vp);
+	sfs_print_node(ap->a_vp->v_data);
 	return (0);
 }
 
 /*
- * .zfs inode namespace
- *
- * We need to generate unique inode numbers for all files and directories
- * within the .zfs pseudo-filesystem.  We use the following scheme:
- *
- * 	ENTRY			ZFSCTL_INODE
- * 	.zfs			1
- * 	.zfs/snapshot		2
- * 	.zfs/snapshot/<snap>	objectid(snap)
- */
-
-#define	ZFSCTL_INO_SNAP(id)	(id)
-
-/*
  * Get root directory attributes.
  */
-/* ARGSUSED */
 static int
 zfsctl_root_getattr(ap)
 	struct vop_getattr_args /* {
@@ -501,96 +580,45 @@
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
-	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-	zfsctl_node_t *zcp = vp->v_data;
+	zfsctl_root_t *node = vp->v_data;
 
-	ZFS_ENTER(zfsvfs);
-	vap->va_nodeid = ZFSCTL_INO_ROOT;
-	vap->va_nlink = vap->va_size = NROOT_ENTRIES;
-	vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
+	zfsctl_common_getattr(vp, vap);
+	vap->va_ctime = node->cmtime;
+	vap->va_mtime = vap->va_ctime;
 	vap->va_birthtime = vap->va_ctime;
-
-	zfsctl_common_getattr(vp, vap);
-	ZFS_EXIT(zfsvfs);
-
+	vap->va_nlink += 1; /* snapdir */
+	vap->va_size = vap->va_nlink;
 	return (0);
 }
 
 /*
- * Special case the handling of "..".
+ * A lookup of "." may ask for a lock type different from the one
+ * dvp currently holds, so relock it as requested.
  */
-/* ARGSUSED */
 int
-zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
-    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
-    int *direntflags, pathname_t *realpnp)
+zfsctl_relock_dot(vnode_t *dvp, int ltype)
 {
-	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-	int err;
+	vref(dvp);
+	if (ltype != VOP_ISLOCKED(dvp)) {
+		if (ltype == LK_EXCLUSIVE)
+			vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+		else /* if (ltype == LK_SHARED) */
+			vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
 
-	/*
-	 * No extended attributes allowed under .zfs
-	 */
-	if (flags & LOOKUP_XATTR)
-		return (SET_ERROR(EINVAL));
-
-	ZFS_ENTER(zfsvfs);
-
-	if (strcmp(nm, "..") == 0) {
-		err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp);
-		if (err == 0)
-			VOP_UNLOCK(*vpp, 0);
-	} else {
-		err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
-		    cr, ct, direntflags, realpnp);
+		/*
+		 * The relock for the "." case may have left us with a
+		 * reclaimed vnode.
+		 */
+		if ((dvp->v_iflag & VI_DOOMED) != 0) {
+			vrele(dvp);
+			return (SET_ERROR(ENOENT));
+		}
 	}
-
-	ZFS_EXIT(zfsvfs);
-
-	return (err);
+	return (0);
 }
 
-#ifdef sun
-static int
-zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
-    caller_context_t *ct)
-{
-	/*
-	 * We only care about ACL_ENABLED so that libsec can
-	 * display ACL correctly and not default to POSIX draft.
-	 */
-	if (cmd == _PC_ACL_ENABLED) {
-		*valp = _ACL_ACE_ENABLED;
-		return (0);
-	}
-
-	return (fs_pathconf(vp, cmd, valp, cr, ct));
-}
-#endif	/* sun */
-
-#ifdef sun
-static const fs_operation_def_t zfsctl_tops_root[] = {
-	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
-	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
-	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
-	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_root_getattr }	},
-	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
-	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir } 	},
-	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_root_lookup }	},
-	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
-	{ VOPNAME_INACTIVE,	{ .vop_inactive = gfs_vop_inactive }	},
-	{ VOPNAME_PATHCONF,	{ .vop_pathconf = zfsctl_pathconf }	},
-	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid	}	},
-	{ NULL }
-};
-#endif	/* sun */
-
 /*
  * Special case the handling of "..".
  */
-/* ARGSUSED */
 int
-zfsctl_freebsd_root_lookup(ap)
+zfsctl_root_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
@@ -597,329 +625,282 @@
 		struct componentname *a_cnp;
 	} */ *ap;
 {
+	struct componentname *cnp = ap->a_cnp;
 	vnode_t *dvp = ap->a_dvp;
 	vnode_t **vpp = ap->a_vpp;
 	cred_t *cr = ap->a_cnp->cn_cred;
 	int flags = ap->a_cnp->cn_flags;
+	int lkflags = ap->a_cnp->cn_lkflags;
 	int nameiop = ap->a_cnp->cn_nameiop;
-	char nm[NAME_MAX + 1];
 	int err;
+	int ltype;
 
-	if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE))
-		return (EOPNOTSUPP);
+	ASSERT(dvp->v_type == VDIR);
 
-	ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
-	strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
+	if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+		return (SET_ERROR(ENOTSUP));
 
-	err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr, NULL, NULL, NULL);
-	if (err == 0 && (nm[0] != '.' || nm[1] != '\0'))
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
+	if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+		err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+		if (err == 0)
+			*vpp = dvp;
+	} else if ((flags & ISDOTDOT) != 0) {
+		err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL,
+		    lkflags, vpp);
+	} else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) {
+		err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp);
+	} else {
+		err = SET_ERROR(ENOENT);
+	}
+	if (err != 0)
+		*vpp = NULL;
 	return (err);
 }
 
-static struct vop_vector zfsctl_ops_root = {
-	.vop_default =	&default_vnodeops,
-	.vop_open =	zfsctl_common_open,
-	.vop_close =	zfsctl_common_close,
-	.vop_ioctl =	VOP_EINVAL,
-	.vop_getattr =	zfsctl_root_getattr,
-	.vop_access =	zfsctl_common_access,
-	.vop_readdir =	gfs_vop_readdir,
-	.vop_lookup =	zfsctl_freebsd_root_lookup,
-	.vop_inactive =	gfs_vop_inactive,
-	.vop_reclaim =	zfsctl_common_reclaim,
-#ifdef TODO
-	.vop_pathconf =	zfsctl_pathconf,
-#endif
-	.vop_fid =	zfsctl_common_fid,
-};
-
-/*
- * Gets the full dataset name that corresponds to the given snapshot name
- * Example:
- * 	zfsctl_snapshot_zname("snap1") -> "mypool/myfs at snap1"
- */
 static int
-zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
+zfsctl_root_readdir(ap)
+	struct vop_readdir_args /* {
+		struct vnode *a_vp;
+		struct uio *a_uio;
+		struct ucred *a_cred;
+		int *a_eofflag;
+		int *ncookies;
+		u_long **a_cookies;
+	} */ *ap;
 {
-	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+	struct dirent entry;
+	vnode_t *vp = ap->a_vp;
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	zfsctl_root_t *node = vp->v_data;
+	uio_t *uio = ap->a_uio;
+	int *eofp = ap->a_eofflag;
+	off_t dots_offset;
+	int error;
 
-	if (snapshot_namecheck(name, NULL, NULL) != 0)
-		return (SET_ERROR(EILSEQ));
-	dmu_objset_name(os, zname);
-	if (strlen(zname) + 1 + strlen(name) >= len)
-		return (SET_ERROR(ENAMETOOLONG));
-	(void) strcat(zname, "@");
-	(void) strcat(zname, name);
+	ASSERT(vp->v_type == VDIR);
+
+	error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio,
+	    &dots_offset);
+	if (error != 0) {
+		if (error == ENAMETOOLONG) /* ran out of destination space */
+			error = 0;
+		return (error);
+	}
+	if (uio->uio_offset != dots_offset)
+		return (SET_ERROR(EINVAL));
+
+	CTASSERT(sizeof(node->snapdir->sn_name) <= sizeof(entry.d_name));
+	entry.d_fileno = node->snapdir->sn_id;
+	entry.d_type = DT_DIR;
+	strcpy(entry.d_name, node->snapdir->sn_name);
+	entry.d_namlen = strlen(entry.d_name);
+	entry.d_reclen = sizeof(entry);
+	error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+	if (error != 0) {
+		if (error == ENAMETOOLONG)
+			error = 0;
+		return (SET_ERROR(error));
+	}
+	if (eofp != NULL)
+		*eofp = 1;
 	return (0);
 }
 
 static int
-zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
+zfsctl_root_vptocnp(struct vop_vptocnp_args *ap)
 {
-	vnode_t *svp = sep->se_root;
+	static const char dotzfs_name[4] = ".zfs";
+	vnode_t *dvp;
 	int error;
 
-	ASSERT(vn_ismntpt(svp));
+	if (*ap->a_buflen < sizeof (dotzfs_name))
+		return (SET_ERROR(ENOMEM));
 
-	/* this will be dropped by dounmount() */
-	if ((error = vn_vfswlock(svp)) != 0)
-		return (error);
+	error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL,
+	    LK_SHARED, &dvp);
+	if (error != 0)
+		return (SET_ERROR(error));
 
-#ifdef sun
-	VN_HOLD(svp);
-	error = dounmount(vn_mountedvfs(svp), fflags, cr);
-	if (error) {
-		VN_RELE(svp);
-		return (error);
-	}
-
-	/*
-	 * We can't use VN_RELE(), as that will try to invoke
-	 * zfsctl_snapdir_inactive(), which would cause us to destroy
-	 * the sd_lock mutex held by our caller.
-	 */
-	ASSERT(svp->v_count == 1);
-	gfs_vop_inactive(svp, cr, NULL);
-
-	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-	kmem_free(sep, sizeof (zfs_snapentry_t));
-
+	VOP_UNLOCK(dvp, 0);
+	*ap->a_vpp = dvp;
+	*ap->a_buflen -= sizeof (dotzfs_name);
+	bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name));
 	return (0);
-#else	/* !sun */
-	return (dounmount(vn_mountedvfs(svp), fflags, curthread));
-#endif	/* !sun */
 }
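
For reference, VOP_VPTOCNP prepends the component to the tail of the caller's buffer and shrinks *a_buflen; dotzfs_name is exactly 4 bytes because the component is copied without a NUL terminator. A sketch of the resulting buffer state (a_buflen assumed to be an int, per the vop_vptocnp_args convention):

	int before = *ap->a_buflen;

	if (zfsctl_root_vptocnp(ap) == 0) {
		/* ".zfs" (4 bytes, no NUL) was prepended at the tail. */
		ASSERT3S(*ap->a_buflen, ==, before - 4);
		ASSERT0(bcmp(ap->a_buf + *ap->a_buflen, ".zfs", 4));
	}
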
 
-#ifdef sun
-static void
-zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
+static int
+zfsctl_common_pathconf(ap)
+	struct vop_pathconf_args /* {
+		struct vnode *a_vp;
+		int a_name;
+		int *a_retval;
+	} */ *ap;
 {
-	avl_index_t where;
-	vfs_t *vfsp;
-	refstr_t *pathref;
-	char newpath[MAXNAMELEN];
-	char *tail;
+	/*
+	 * We care about ACL variables so that user land utilities like ls
+	 * can display them correctly.  Since the ctldir's st_dev is set to be
+	 * the same as the parent dataset, we must support all variables that
+	 * it supports.
+	 */
+	switch (ap->a_name) {
+	case _PC_LINK_MAX:
+		*ap->a_retval = INT_MAX;
+		return (0);
 
-	ASSERT(MUTEX_HELD(&sdp->sd_lock));
-	ASSERT(sep != NULL);
+	case _PC_FILESIZEBITS:
+		*ap->a_retval = 64;
+		return (0);
 
-	vfsp = vn_mountedvfs(sep->se_root);
-	ASSERT(vfsp != NULL);
+	case _PC_MIN_HOLE_SIZE:
+		*ap->a_retval = (int)SPA_MINBLOCKSIZE;
+		return (0);
 
-	vfs_lock_wait(vfsp);
+	case _PC_ACL_EXTENDED:
+		*ap->a_retval = 0;
+		return (0);
 
-	/*
-	 * Change the name in the AVL tree.
-	 */
-	avl_remove(&sdp->sd_snaps, sep);
-	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
-	(void) strcpy(sep->se_name, nm);
-	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
-	avl_insert(&sdp->sd_snaps, sep, where);
+	case _PC_ACL_NFS4:
+		*ap->a_retval = 1;
+		return (0);
 
-	/*
-	 * Change the current mountpoint info:
-	 * 	- update the tail of the mntpoint path
-	 *	- update the tail of the resource path
-	 */
-	pathref = vfs_getmntpoint(vfsp);
-	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
-	VERIFY((tail = strrchr(newpath, '/')) != NULL);
-	*(tail+1) = '\0';
-	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
-	(void) strcat(newpath, nm);
-	refstr_rele(pathref);
-	vfs_setmntpoint(vfsp, newpath, 0);
+	case _PC_ACL_PATH_MAX:
+		*ap->a_retval = ACL_MAX_ENTRIES;
+		return (0);
 
-	pathref = vfs_getresource(vfsp);
-	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
-	VERIFY((tail = strrchr(newpath, '@')) != NULL);
-	*(tail+1) = '\0';
-	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
-	(void) strcat(newpath, nm);
-	refstr_rele(pathref);
-	vfs_setresource(vfsp, newpath, 0);
+	case _PC_NAME_MAX:
+		*ap->a_retval = NAME_MAX;
+		return (0);
 
-	vfs_unlock(vfsp);
+	default:
+		return (vop_stdpathconf(ap));
+	}
 }
-#endif	/* sun */
 
-#ifdef sun
-/*ARGSUSED*/
-static int
-zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
-    cred_t *cr, caller_context_t *ct, int flags)
+/*
+ * Return a trivial ACL.
+ */
+int
+zfsctl_common_getacl(ap)
+	struct vop_getacl_args /* {
+		struct vnode *vp;
+		acl_type_t a_type;
+		struct acl *a_aclp;
+		struct ucred *cred;
+		struct thread *td;
+	} */ *ap;
 {
-	zfsctl_snapdir_t *sdp = sdvp->v_data;
-	zfs_snapentry_t search, *sep;
-	zfsvfs_t *zfsvfs;
-	avl_index_t where;
-	char from[MAXNAMELEN], to[MAXNAMELEN];
-	char real[MAXNAMELEN], fsname[MAXNAMELEN];
-	int err;
+	int i;
 
-	zfsvfs = sdvp->v_vfsp->vfs_data;
-	ZFS_ENTER(zfsvfs);
+	if (ap->a_type != ACL_TYPE_NFS4)
+		return (EINVAL);
 
-	if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
-		err = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
-		    MAXNAMELEN, NULL);
-		if (err == 0) {
-			snm = real;
-		} else if (err != ENOTSUP) {
-			ZFS_EXIT(zfsvfs);
-			return (err);
-		}
-	}
-
-	ZFS_EXIT(zfsvfs);
-
-	dmu_objset_name(zfsvfs->z_os, fsname);
-
-	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
-	if (err == 0)
-		err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
-	if (err == 0)
-		err = zfs_secpolicy_rename_perms(from, to, cr);
-	if (err != 0)
-		return (err);
-
+	acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0);
 	/*
-	 * Cannot move snapshots out of the snapdir.
+	 * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify
+	 * attributes.  That is not the case for the ctldir, so we must clear
+	 * those bits.  We also must clear ACL_READ_NAMED_ATTRS, because xattrs
+	 * aren't supported by the ctldir.
 	 */
-	if (sdvp != tdvp)
-		return (SET_ERROR(EINVAL));
+	for (i = 0; i < ap->a_aclp->acl_cnt; i++) {
+		struct acl_entry *entry;
+
+		entry = &(ap->a_aclp->acl_entry[i]);
+		entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER |
+		    ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS |
+		    ACL_READ_NAMED_ATTRS);
+	}
 
-	if (strcmp(snm, tnm) == 0)
-		return (0);
+	return (0);
+}
 
-	mutex_enter(&sdp->sd_lock);
+static struct vop_vector zfsctl_ops_root = {
+	.vop_default =	&default_vnodeops,
+	.vop_open =	zfsctl_common_open,
+	.vop_close =	zfsctl_common_close,
+	.vop_ioctl =	VOP_EINVAL,
+	.vop_getattr =	zfsctl_root_getattr,
+	.vop_access =	zfsctl_common_access,
+	.vop_readdir =	zfsctl_root_readdir,
+	.vop_lookup =	zfsctl_root_lookup,
+	.vop_inactive =	VOP_NULL,
+	.vop_reclaim =	zfsctl_common_reclaim,
+	.vop_fid =	zfsctl_common_fid,
+	.vop_print =	zfsctl_common_print,
+	.vop_vptocnp =	zfsctl_root_vptocnp,
+	.vop_pathconf =	zfsctl_common_pathconf,
+	.vop_getacl =	zfsctl_common_getacl,
+};
 
-	search.se_name = (char *)snm;
-	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
-		mutex_exit(&sdp->sd_lock);
-		return (SET_ERROR(ENOENT));
-	}
+static int
+zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
+{
+	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
 
-	err = dsl_dataset_rename_snapshot(fsname, snm, tnm, 0);
-	if (err == 0)
-		zfsctl_rename_snap(sdp, sep, tnm);
-
-	mutex_exit(&sdp->sd_lock);
-
-	return (err);
+	dmu_objset_name(os, zname);
+	if (strlen(zname) + 1 + strlen(name) >= len)
+		return (SET_ERROR(ENAMETOOLONG));
+	(void) strcat(zname, "@");
+	(void) strcat(zname, name);
+	return (0);
 }
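
An example of the resulting name, restoring the illustration from the comment this change removes (dataset and snapshot names hypothetical): for a dataset "mypool/myfs" and snapshot name "snap1", zname is filled with "mypool/myfs@snap1"; ENAMETOOLONG is returned if the result would not fit in len bytes.

	char zname[ZFS_MAX_DATASET_NAME_LEN];
	int err;

	/* Yields "mypool/myfs@snap1" for a dataset "mypool/myfs". */
	err = zfsctl_snapshot_zname(vp, "snap1", sizeof (zname), zname);
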
-#endif	/* sun */
 
-#ifdef sun
-/* ARGSUSED */
 static int
-zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
-    caller_context_t *ct, int flags)
+zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id)
 {
-	zfsctl_snapdir_t *sdp = dvp->v_data;
-	zfs_snapentry_t *sep;
-	zfs_snapentry_t search;
-	zfsvfs_t *zfsvfs;
-	char snapname[MAXNAMELEN];
-	char real[MAXNAMELEN];
+	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
 	int err;
 
-	zfsvfs = dvp->v_vfsp->vfs_data;
-	ZFS_ENTER(zfsvfs);
-
-	if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
-
-		err = dmu_snapshot_realname(zfsvfs->z_os, name, real,
-		    MAXNAMELEN, NULL);
-		if (err == 0) {
-			name = real;
-		} else if (err != ENOTSUP) {
-			ZFS_EXIT(zfsvfs);
-			return (err);
-		}
-	}
-
-	ZFS_EXIT(zfsvfs);
-
-	err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
-	if (err == 0)
-		err = zfs_secpolicy_destroy_perms(snapname, cr);
-	if (err != 0)
-		return (err);
-
-	mutex_enter(&sdp->sd_lock);
-
-	search.se_name = name;
-	sep = avl_find(&sdp->sd_snaps, &search, NULL);
-	if (sep) {
-		avl_remove(&sdp->sd_snaps, sep);
-		err = zfsctl_unmount_snap(sep, MS_FORCE, cr);
-		if (err != 0)
-			avl_add(&sdp->sd_snaps, sep);
-		else
-			err = dsl_destroy_snapshot(snapname, B_FALSE);
-	} else {
-		err = SET_ERROR(ENOENT);
-	}
-
-	mutex_exit(&sdp->sd_lock);
-
+	err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id);
 	return (err);
 }
-#endif	/* sun */
 
 /*
- * This creates a snapshot under '.zfs/snapshot'.
+ * Given a vnode, get the root vnode of a filesystem mounted on top of
+ * the vnode, if any.  The root vnode is referenced and locked.
+ * If no filesystem is mounted, then the original vnode remains referenced
+ * and locked.  If any error happens, the original vnode is unlocked and
+ * released.
  */
-/* ARGSUSED */
 static int
-zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t  **vpp,
-    cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp)
+zfsctl_mounted_here(vnode_t **vpp, int flags)
 {
-	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-	char name[MAXNAMELEN];
+	struct mount *mp;
 	int err;
-	static enum symfollow follow = NO_FOLLOW;
-	static enum uio_seg seg = UIO_SYSSPACE;
 
-	if (snapshot_namecheck(dirname, NULL, NULL) != 0)
-		return (SET_ERROR(EILSEQ));
+	ASSERT_VOP_LOCKED(*vpp, __func__);
+	ASSERT3S((*vpp)->v_type, ==, VDIR);
 
-	dmu_objset_name(zfsvfs->z_os, name);
-
-	*vpp = NULL;
-
-	err = zfs_secpolicy_snapshot_perms(name, cr);
-	if (err != 0)
+	if ((mp = (*vpp)->v_mountedhere) != NULL) {
+		err = vfs_busy(mp, 0);
+		KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err));
+		KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint"));
+		vput(*vpp);
+		err = VFS_ROOT(mp, flags, vpp);
+		vfs_unbusy(mp);
 		return (err);
-
-	if (err == 0) {
-		err = dmu_objset_snapshot_one(name, dirname);
-		if (err != 0)
-			return (err);
-		err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
 	}
-
-	return (err);
+	return (EJUSTRETURN);
 }
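
A sketch of the intended calling pattern (mirroring zfsctl_snapdir_lookup() below, with err, vpp, and lkflags assumed from the caller): EJUSTRETURN signals that nothing is mounted and *vpp is unchanged; any other return disposes of the original vnode.

	err = zfsctl_mounted_here(vpp, lkflags);
	if (err != EJUSTRETURN)
		return (err);	/* 0: *vpp is now the covering root */
	/* Not covered: *vpp is still the original, locked vnode. */
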
 
-static int
-zfsctl_freebsd_snapdir_mkdir(ap)
-        struct vop_mkdir_args /* {
-                struct vnode *a_dvp;
-                struct vnode **a_vpp;
-                struct componentname *a_cnp;
-                struct vattr *a_vap;
-        } */ *ap;
+typedef struct {
+	const char *snap_name;
+	uint64_t    snap_id;
+} snapshot_setup_arg_t;
+
+static void
+zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg)
 {
+	snapshot_setup_arg_t *ssa = arg;
+	sfs_node_t *node;
 
-	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+	ASSERT_VOP_ELOCKED(vp, __func__);
 
-	return (zfsctl_snapdir_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, NULL,
-	    ap->a_vpp, ap->a_cnp->cn_cred, NULL, 0, NULL));
+	node = sfs_alloc_node(sizeof(sfs_node_t),
+	    ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id);
+	zfsctl_common_vnode_setup(vp, node);
+
+	/* We have to support recursive locking. */
+	VN_LOCK_AREC(vp);
 }
 
 /*
@@ -926,8 +907,13 @@
  * Lookup entry point for the 'snapshot' directory.  Try to open the
  * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
  * Perform a mount of the associated dataset on top of the vnode.
+ * There are four possibilities:
+ * - the snapshot node and vnode do not exist
+ * - the snapshot vnode is covered by the mounted snapshot
+ * - the snapshot vnode is not covered yet because the mount is still in progress
+ * - the snapshot vnode is not covered, because the snapshot has been unmounted
+ * The last two states are transient and should be relatively short-lived.
  */
-/* ARGSUSED */
 int
 zfsctl_snapdir_lookup(ap)
 	struct vop_lookup_args /* {
@@ -939,144 +925,102 @@
 	vnode_t *dvp = ap->a_dvp;
 	vnode_t **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
-	char nm[NAME_MAX + 1];
-	zfsctl_snapdir_t *sdp = dvp->v_data;
-	objset_t *snap;
-	char snapname[MAXNAMELEN];
-	char real[MAXNAMELEN];
+	char name[NAME_MAX + 1];
+	char fullname[ZFS_MAX_DATASET_NAME_LEN];
 	char *mountpoint;
-	zfs_snapentry_t *sep, search;
 	size_t mountpoint_len;
-	avl_index_t where;
 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+	uint64_t snap_id;
+	int nameiop = cnp->cn_nameiop;
+	int lkflags = cnp->cn_lkflags;
+	int flags = cnp->cn_flags;
 	int err;
-	int flags = 0;
 
-	/*
-	 * No extended attributes allowed under .zfs
-	 */
-	if (flags & LOOKUP_XATTR)
-		return (SET_ERROR(EINVAL));
-	ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
-	strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
-
 	ASSERT(dvp->v_type == VDIR);
 
-	*vpp = NULL;
+	if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+		return (SET_ERROR(ENOTSUP));
 
-	/*
-	 * If we get a recursive call, that means we got called
-	 * from the domount() code while it was trying to look up the
-	 * spec (which looks like a local path for zfs).  We need to
-	 * add some flag to domount() to tell it not to do this lookup.
-	 */
-	if (MUTEX_HELD(&sdp->sd_lock))
+	if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+		err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+		if (err == 0)
+			*vpp = dvp;
+		return (err);
+	}
+	if (flags & ISDOTDOT) {
+		err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags,
+		    vpp);
+		return (err);
+	}
+
+	if (cnp->cn_namelen >= sizeof(name))
+		return (SET_ERROR(ENAMETOOLONG));
+
+	strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
+	err = zfsctl_snapshot_lookup(dvp, name, &snap_id);
+	if (err != 0)
 		return (SET_ERROR(ENOENT));
 
-	ZFS_ENTER(zfsvfs);
+	for (;;) {
+		snapshot_setup_arg_t ssa;
 
-	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
-		ZFS_EXIT(zfsvfs);
-		return (0);
-	}
+		ssa.snap_name = name;
+		ssa.snap_id = snap_id;
+		err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR,
+		   snap_id, "zfs", &zfsctl_ops_snapshot,
+		   zfsctl_snapshot_vnode_setup, &ssa, vpp);
+		if (err != 0)
+			return (err);
 
-	if (flags & FIGNORECASE) {
-		boolean_t conflict = B_FALSE;
+		/* Check if a new vnode has just been created. */
+		if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE)
+			break;
 
-		err = dmu_snapshot_realname(zfsvfs->z_os, nm, real,
-		    MAXNAMELEN, &conflict);
-		if (err == 0) {
-			strlcpy(nm, real, sizeof(nm));
-		} else if (err != ENOTSUP) {
-			ZFS_EXIT(zfsvfs);
+		/*
+		 * Check if a snapshot is already mounted on top of the vnode.
+		 */
+		err = zfsctl_mounted_here(vpp, lkflags);
+		if (err != EJUSTRETURN)
 			return (err);
-		}
-#if 0
-		if (realpnp)
-			(void) strlcpy(realpnp->pn_buf, nm,
-			    realpnp->pn_bufsize);
-		if (conflict && direntflags)
-			*direntflags = ED_CASE_CONFLICT;
-#endif
-	}
 
-	mutex_enter(&sdp->sd_lock);
-	search.se_name = (char *)nm;
-	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
-		*vpp = sep->se_root;
-		VN_HOLD(*vpp);
-		err = traverse(vpp, LK_EXCLUSIVE | LK_RETRY);
-		if (err != 0) {
-			VN_RELE(*vpp);
-			*vpp = NULL;
-		} else if (*vpp == sep->se_root) {
-			/*
-			 * The snapshot was unmounted behind our backs,
-			 * try to remount it.
+		/*
+		 * If the vnode is not covered, then either the mount operation
+		 * is in progress or the snapshot has already been unmounted
+		 * but the vnode hasn't been inactivated and reclaimed yet.
+		 * We can try to re-use the vnode in the latter case.
+		 */
+		VI_LOCK(*vpp);
+		if (((*vpp)->v_iflag & VI_MOUNT) == 0) {
+			/*
+			 * Upgrade to an exclusive lock in order to:
+			 * - avoid race conditions
+			 * - satisfy the contract of mount_snapshot()
 			 */
-			VERIFY(zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname) == 0);
-			goto domount;
+			err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK);
+			if (err == 0)
+				break;
 		} else {
-			/*
-			 * VROOT was set during the traverse call.  We need
-			 * to clear it since we're pretending to be part
-			 * of our parent's vfs.
-			 */
-			(*vpp)->v_flag &= ~VROOT;
+			VI_UNLOCK(*vpp);
 		}
-		mutex_exit(&sdp->sd_lock);
-		ZFS_EXIT(zfsvfs);
-		return (err);
-	}
 
-	/*
-	 * The requested snapshot is not currently mounted, look it up.
-	 */
-	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
-	if (err != 0) {
-		mutex_exit(&sdp->sd_lock);
-		ZFS_EXIT(zfsvfs);
 		/*
-		 * handle "ls *" or "?" in a graceful manner,
-		 * forcing EILSEQ to ENOENT.
-		 * Since shell ultimately passes "*" or "?" as name to lookup
+		 * In this state we can loop on uncontested locks and starve
+		 * the thread doing the lengthy, non-trivial mount operation.
+		 * So, yield to prevent that from happening.
 		 */
-		return (err == EILSEQ ? ENOENT : err);
+		vput(*vpp);
+		kern_yield(PRI_USER);
 	}
-	if (dmu_objset_hold(snapname, FTAG, &snap) != 0) {
-		mutex_exit(&sdp->sd_lock);
-#ifdef illumos
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(ENOENT));
-#else	/* !illumos */
-		/* Translate errors and add SAVENAME when needed. */
-		if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE) {
-			err = EJUSTRETURN;
-			cnp->cn_flags |= SAVENAME;
-		} else {
-			err = SET_ERROR(ENOENT);
-		}
-		ZFS_EXIT(zfsvfs);
-		return (err);
-#endif	/* !illumos */
-	}
 
-	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
-	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
-	(void) strcpy(sep->se_name, nm);
-	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
-	VN_HOLD(*vpp);
-	avl_insert(&sdp->sd_snaps, sep, where);
+	VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof(fullname), fullname));
 
-	dmu_objset_rele(snap, FTAG);
-domount:
 	mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
-	    strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(nm) + 1;
+	    strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1;
 	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
 	(void) snprintf(mountpoint, mountpoint_len,
 	    "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
-	    dvp->v_vfsp->mnt_stat.f_mntonname, nm);
-	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, snapname, 0);
+	    dvp->v_vfsp->mnt_stat.f_mntonname, name);
+
+	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
 	kmem_free(mountpoint, mountpoint_len);
 	if (err == 0) {
 		/*
@@ -1088,228 +1032,84 @@
 		 */
 		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
 		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
+
+		/* Clear the root flag (set via VFS_ROOT) as well. */
+		(*vpp)->v_vflag &= ~VV_ROOT;
 	}
-	mutex_exit(&sdp->sd_lock);
-	ZFS_EXIT(zfsvfs);
 
-#ifdef illumos
-	/*
-	 * If we had an error, drop our hold on the vnode and
-	 * zfsctl_snapshot_inactive() will clean up.
-	 */
-	if (err != 0) {
-		VN_RELE(*vpp);
-		*vpp = NULL;
-	}
-#else
 	if (err != 0)
 		*vpp = NULL;
-#endif
 	return (err);
 }
 
-/* ARGSUSED */
-int
-zfsctl_shares_lookup(ap)
-	struct vop_lookup_args /* {
-		struct vnode *a_dvp;
-		struct vnode **a_vpp;
-		struct componentname *a_cnp;
-	} */ *ap;
-{
-	vnode_t *dvp = ap->a_dvp;
-	vnode_t **vpp = ap->a_vpp;
-	struct componentname *cnp = ap->a_cnp;
-	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-	char nm[NAME_MAX + 1];
-	znode_t *dzp;
-	int error;
-
-	ZFS_ENTER(zfsvfs);
-
-	ASSERT(cnp->cn_namelen < sizeof(nm));
-	strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
-
-	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
-		ZFS_EXIT(zfsvfs);
-		return (0);
-	}
-
-	if (zfsvfs->z_shares_dir == 0) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(ENOTSUP));
-	}
-	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0)
-		error = VOP_LOOKUP(ZTOV(dzp), vpp, cnp);
-
-	VN_RELE(ZTOV(dzp));
-	ZFS_EXIT(zfsvfs);
-
-	return (error);
-}
-
-/* ARGSUSED */
 static int
-zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
-    offset_t *offp, offset_t *nextp, void *data, int flags)
-{
-	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-	char snapname[MAXNAMELEN];
-	uint64_t id, cookie;
-	boolean_t case_conflict;
-	int error;
-
-	ZFS_ENTER(zfsvfs);
-
-	cookie = *offp;
-	dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
-	error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
-	    &cookie, &case_conflict);
-	dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
-	if (error) {
-		ZFS_EXIT(zfsvfs);
-		if (error == ENOENT) {
-			*eofp = 1;
-			return (0);
-		}
-		return (error);
-	}
-
-	if (flags & V_RDDIR_ENTFLAGS) {
-		edirent_t *eodp = dp;
-
-		(void) strcpy(eodp->ed_name, snapname);
-		eodp->ed_ino = ZFSCTL_INO_SNAP(id);
-		eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0;
-	} else {
-		struct dirent64 *odp = dp;
-
-		(void) strcpy(odp->d_name, snapname);
-		odp->d_ino = ZFSCTL_INO_SNAP(id);
-	}
-	*nextp = cookie;
-
-	ZFS_EXIT(zfsvfs);
-
-	return (0);
-}
-
-/* ARGSUSED */
-static int
-zfsctl_shares_readdir(ap)
+zfsctl_snapdir_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
+	char snapname[ZFS_MAX_DATASET_NAME_LEN];
+	struct dirent entry;
 	vnode_t *vp = ap->a_vp;
-	uio_t *uiop = ap->a_uio;
-	cred_t *cr = ap->a_cred;
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	uio_t *uio = ap->a_uio;
 	int *eofp = ap->a_eofflag;
-	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-	znode_t *dzp;
+	off_t dots_offset;
 	int error;
 
-	ZFS_ENTER(zfsvfs);
+	ASSERT(vp->v_type == VDIR);
 
-	if (zfsvfs->z_shares_dir == 0) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(ENOTSUP));
+	error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio,
+	    &dots_offset);
+	if (error != 0) {
+		if (error == ENAMETOOLONG) /* ran out of destination space */
+			error = 0;
+		return (error);
 	}
-	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
-		vn_lock(ZTOV(dzp), LK_SHARED | LK_RETRY);
-		error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ap->a_ncookies, ap->a_cookies);
-		VN_URELE(ZTOV(dzp));
-	} else {
-		*eofp = 1;
-		error = SET_ERROR(ENOENT);
-	}
 
-	ZFS_EXIT(zfsvfs);
-	return (error);
-}
+	ZFS_ENTER(zfsvfs);
+	for (;;) {
+		uint64_t cookie;
+		uint64_t id;
 
-/*
- * pvp is the '.zfs' directory (zfsctl_node_t).
- *
- * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
- *
- * This function is the callback to create a GFS vnode for '.zfs/snapshot'
- * when a lookup is performed on .zfs for "snapshot".
- */
-vnode_t *
-zfsctl_mknode_snapdir(vnode_t *pvp)
-{
-	vnode_t *vp;
-	zfsctl_snapdir_t *sdp;
+		cookie = uio->uio_offset - dots_offset;
 
-	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp,
-	    &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
-	    zfsctl_snapdir_readdir_cb, NULL);
-	sdp = vp->v_data;
-	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
-	sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
-	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
-	avl_create(&sdp->sd_snaps, snapentry_compare,
-	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
-	VOP_UNLOCK(vp, 0);
-	return (vp);
-}
+		dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
+		    snapname, &id, &cookie, NULL);
+		dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		if (error != 0) {
+			if (error == ENOENT) {
+				if (eofp != NULL)
+					*eofp = 1;
+				error = 0;
+			}
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
 
-vnode_t *
-zfsctl_mknode_shares(vnode_t *pvp)
-{
-	vnode_t *vp;
-	zfsctl_node_t *sdp;
-
-	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
-	    &zfsctl_ops_shares, NULL, NULL, MAXNAMELEN,
-	    NULL, NULL);
-	sdp = vp->v_data;
-	sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
-	VOP_UNLOCK(vp, 0);
-	return (vp);
-
-}
-
-/* ARGSUSED */
-static int
-zfsctl_shares_getattr(ap)
-	struct vop_getattr_args /* {
-		struct vnode *a_vp;
-		struct vattr *a_vap;
-		struct ucred *a_cred;
-		struct thread *a_td;
-	} */ *ap;
-{
-	vnode_t *vp = ap->a_vp;
-	vattr_t *vap = ap->a_vap;
-	cred_t *cr = ap->a_cred;
-	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-	znode_t *dzp;
-	int error;
-
-	ZFS_ENTER(zfsvfs);
-	if (zfsvfs->z_shares_dir == 0) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(ENOTSUP));
+		entry.d_fileno = id;
+		entry.d_type = DT_DIR;
+		strcpy(entry.d_name, snapname);
+		entry.d_namlen = strlen(entry.d_name);
+		entry.d_reclen = sizeof(entry);
+		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+		if (error != 0) {
+			if (error == ENAMETOOLONG)
+				error = 0;
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(error));
+		}
+		uio->uio_offset = cookie + dots_offset;
 	}
-	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
-		vn_lock(ZTOV(dzp), LK_SHARED | LK_RETRY);
-		error = VOP_GETATTR(ZTOV(dzp), vap, cr);
-		VN_URELE(ZTOV(dzp));
-	}
-	ZFS_EXIT(zfsvfs);
-	return (error);
-
-
+	/* NOTREACHED */
 }
 
-/* ARGSUSED */
 static int
 zfsctl_snapdir_getattr(ap)
 	struct vop_getattr_args /* {
@@ -1321,135 +1121,46 @@
 	vnode_t *vp = ap->a_vp;
 	vattr_t *vap = ap->a_vap;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-	zfsctl_snapdir_t *sdp = vp->v_data;
+	dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os);
+	sfs_node_t *node = vp->v_data;
+	uint64_t snap_count;
+	int err;
 
 	ZFS_ENTER(zfsvfs);
 	zfsctl_common_getattr(vp, vap);
-	vap->va_nodeid = gfs_file_inode(vp);
-	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
-	vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+	vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+	vap->va_mtime = vap->va_ctime;
 	vap->va_birthtime = vap->va_ctime;
-	ZFS_EXIT(zfsvfs);
-
-	return (0);
-}
-
-/* ARGSUSED */
-static int
-zfsctl_snapdir_inactive(ap)
-	struct vop_inactive_args /* {
-		struct vnode *a_vp;
-		struct thread *a_td;
-	} */ *ap;
-{
-	vnode_t *vp = ap->a_vp;
-	zfsctl_snapdir_t *sdp = vp->v_data;
-	zfs_snapentry_t *sep;
-
-	/*
-	 * On forced unmount we have to free snapshots from here.
-	 */
-	mutex_enter(&sdp->sd_lock);
-	while ((sep = avl_first(&sdp->sd_snaps)) != NULL) {
-		avl_remove(&sdp->sd_snaps, sep);
-		kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-		kmem_free(sep, sizeof (zfs_snapentry_t));
+	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
+		err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset,
+		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
+		if (err != 0) {
+			ZFS_EXIT(zfsvfs);
+			return (err);
+		}
+		vap->va_nlink += snap_count;
 	}
-	mutex_exit(&sdp->sd_lock);
-	gfs_dir_inactive(vp);
-	ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
-	mutex_destroy(&sdp->sd_lock);
-	avl_destroy(&sdp->sd_snaps);
-	kmem_free(sdp, sizeof (zfsctl_snapdir_t));
+	vap->va_size = vap->va_nlink;
 
+	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
-#ifdef sun
-static const fs_operation_def_t zfsctl_tops_snapdir[] = {
-	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
-	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
-	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
-	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_snapdir_getattr } },
-	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
-	{ VOPNAME_RENAME,	{ .vop_rename = zfsctl_snapdir_rename }	},
-	{ VOPNAME_RMDIR,	{ .vop_rmdir = zfsctl_snapdir_remove }	},
-	{ VOPNAME_MKDIR,	{ .vop_mkdir = zfsctl_snapdir_mkdir }	},
-	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir }	},
-	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_snapdir_lookup }	},
-	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
-	{ VOPNAME_INACTIVE,	{ .vop_inactive = zfsctl_snapdir_inactive } },
-	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid }	},
-	{ NULL }
-};
-
-static const fs_operation_def_t zfsctl_tops_shares[] = {
-	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
-	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
-	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
-	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_shares_getattr } },
-	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
-	{ VOPNAME_READDIR,	{ .vop_readdir = zfsctl_shares_readdir } },
-	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_shares_lookup }	},
-	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
-	{ VOPNAME_INACTIVE,	{ .vop_inactive = gfs_vop_inactive } },
-	{ VOPNAME_FID,		{ .vop_fid = zfsctl_shares_fid } },
-	{ NULL }
-};
-#else	/* !sun */
 static struct vop_vector zfsctl_ops_snapdir = {
 	.vop_default =	&default_vnodeops,
 	.vop_open =	zfsctl_common_open,
 	.vop_close =	zfsctl_common_close,
-	.vop_ioctl =	VOP_EINVAL,
 	.vop_getattr =	zfsctl_snapdir_getattr,
 	.vop_access =	zfsctl_common_access,
-	.vop_mkdir =	zfsctl_freebsd_snapdir_mkdir,
-	.vop_readdir =	gfs_vop_readdir,
+	.vop_readdir =	zfsctl_snapdir_readdir,
 	.vop_lookup =	zfsctl_snapdir_lookup,
-	.vop_inactive =	zfsctl_snapdir_inactive,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_fid =	zfsctl_common_fid,
+	.vop_print =	zfsctl_common_print,
+	.vop_pathconf =	zfsctl_common_pathconf,
+	.vop_getacl =	zfsctl_common_getacl,
 };
 
-static struct vop_vector zfsctl_ops_shares = {
-	.vop_default =	&default_vnodeops,
-	.vop_open =	zfsctl_common_open,
-	.vop_close =	zfsctl_common_close,
-	.vop_ioctl =	VOP_EINVAL,
-	.vop_getattr =	zfsctl_shares_getattr,
-	.vop_access =	zfsctl_common_access,
-	.vop_readdir =	zfsctl_shares_readdir,
-	.vop_lookup =	zfsctl_shares_lookup,
-	.vop_inactive =	gfs_vop_inactive,
-	.vop_reclaim =	zfsctl_common_reclaim,
-	.vop_fid =	zfsctl_shares_fid,
-};
-#endif	/* !sun */
-
-/*
- * pvp is the GFS vnode '.zfs/snapshot'.
- *
- * This creates a GFS node under '.zfs/snapshot' representing each
- * snapshot.  This newly created GFS node is what we mount snapshot
- * vfs_t's ontop of.
- */
-static vnode_t *
-zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
-{
-	vnode_t *vp;
-	zfsctl_node_t *zcp;
-
-	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
-	    &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
-	VN_HOLD(vp);
-	zcp = vp->v_data;
-	zcp->zc_id = objset;
-	VOP_UNLOCK(vp, 0);
-
-	return (vp);
-}
-
 static int
 zfsctl_snapshot_inactive(ap)
 	struct vop_inactive_args /* {
@@ -1458,180 +1169,76 @@
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
-	cred_t *cr = ap->a_td->td_ucred;
-	struct vop_inactive_args iap;
-	zfsctl_snapdir_t *sdp;
-	zfs_snapentry_t *sep, *next;
-	int locked;
-	vnode_t *dvp;
 
-	if (vp->v_count > 0)
-		goto end;
-
-	VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0);
-	sdp = dvp->v_data;
-	VOP_UNLOCK(dvp, 0);
-
-	if (!(locked = MUTEX_HELD(&sdp->sd_lock)))
-		mutex_enter(&sdp->sd_lock);
-
-	ASSERT(!vn_ismntpt(vp));
-
-	sep = avl_first(&sdp->sd_snaps);
-	while (sep != NULL) {
-		next = AVL_NEXT(&sdp->sd_snaps, sep);
-
-		if (sep->se_root == vp) {
-			avl_remove(&sdp->sd_snaps, sep);
-			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-			kmem_free(sep, sizeof (zfs_snapentry_t));
-			break;
-		}
-		sep = next;
-	}
-	ASSERT(sep != NULL);
-
-	if (!locked)
-		mutex_exit(&sdp->sd_lock);
-	VN_RELE(dvp);
-
-end:
-	/*
-	 * Dispose of the vnode for the snapshot mount point.
-	 * This is safe to do because once this entry has been removed
-	 * from the AVL tree, it can't be found again, so cannot become
-	 * "active".  If we lookup the same name again we will end up
-	 * creating a new vnode.
-	 */
-	iap.a_vp = vp;
-	return (gfs_vop_inactive(&iap));
+	VERIFY(vrecycle(vp) == 1);
+	return (0);
 }
 
 static int
-zfsctl_traverse_begin(vnode_t **vpp, int lktype)
-{
-
-	VN_HOLD(*vpp);
-	/* Snapshot should be already mounted, but just in case. */
-	if (vn_mountedvfs(*vpp) == NULL)
-		return (ENOENT);
-	return (traverse(vpp, lktype));
-}
-
-static void
-zfsctl_traverse_end(vnode_t *vp, int err)
-{
-
-	if (err == 0)
-		vput(vp);
-	else
-		VN_RELE(vp);
-}
-
-static int
-zfsctl_snapshot_getattr(ap)
-	struct vop_getattr_args /* {
+zfsctl_snapshot_reclaim(ap)
+	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
-		struct vattr *a_vap;
-		struct ucred *a_cred;
+		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
-	int err;
+	void *data = vp->v_data;
 
-	err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY);
-	if (err == 0)
-		err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred);
-	zfsctl_traverse_end(vp, err);
-	return (err);
+	sfs_reclaim_vnode(vp);
+	sfs_destroy_node(data);
+	return (0);
 }
 
 static int
-zfsctl_snapshot_fid(ap)
-	struct vop_fid_args /* {
-		struct vnode *a_vp;
-		struct fid *a_fid;
-	} */ *ap;
+zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
 {
-	vnode_t *vp = ap->a_vp;
-	int err;
-
-	err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY);
-	if (err == 0)
-		err = VOP_VPTOFH(vp, (void *)ap->a_fid);
-	zfsctl_traverse_end(vp, err);
-	return (err);
-}
-
-static int
-zfsctl_snapshot_lookup(ap)
-	struct vop_lookup_args /* {
-		struct vnode *a_dvp;
-		struct vnode **a_vpp;
-		struct componentname *a_cnp;
-	} */ *ap;
-{
-	vnode_t *dvp = ap->a_dvp;
-	vnode_t **vpp = ap->a_vpp;
-	struct componentname *cnp = ap->a_cnp;
-	cred_t *cr = ap->a_cnp->cn_cred;
-	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+	struct mount *mp;
+	vnode_t *dvp;
+	vnode_t *vp;
+	sfs_node_t *node;
+	size_t len;
+	int locked;
 	int error;
 
-	if (cnp->cn_namelen != 2 || cnp->cn_nameptr[0] != '.' ||
-	    cnp->cn_nameptr[1] != '.') {
-		return (ENOENT);
-	}
+	vp = ap->a_vp;
+	node = vp->v_data;
+	len = strlen(node->sn_name);
+	if (*ap->a_buflen < len)
+		return (SET_ERROR(ENOMEM));
 
-	ASSERT(dvp->v_type == VDIR);
-	ASSERT(zfsvfs->z_ctldir != NULL);
+	/*
+	 * Prevent unmounting of the snapshot while the vnode lock
+	 * is not held.  That is not strictly required, but allows
+	 * us to assert that an uncovered snapshot vnode is never
+	 * "leaked".
+	 */
+	mp = vp->v_mountedhere;
+	if (mp == NULL)
+		return (SET_ERROR(ENOENT));
+	error = vfs_busy(mp, 0);
+	KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error));
 
-	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", vpp,
-	    NULL, 0, NULL, cr, NULL, NULL, NULL);
-	if (error == 0)
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
-	return (error);
-}
+	/*
+	 * We can vput the vnode as we can now depend on the reference owned
+	 * by the busied mp.  But we also need to hold the vnode, because
+	 * the reference may go away after vfs_unbusy(), which has to be called
+	 * before we can lock the vnode again.
+	 */
+	locked = VOP_ISLOCKED(vp);
+	vhold(vp);
+	vput(vp);
 
-static int
-zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
-{
-	zfsvfs_t *zfsvfs = ap->a_vp->v_vfsp->vfs_data;
-	vnode_t *dvp, *vp;
-	zfsctl_snapdir_t *sdp;
-	zfs_snapentry_t *sep;
-	int error;
-
-	ASSERT(zfsvfs->z_ctldir != NULL);
-	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
-	    NULL, 0, NULL, kcred, NULL, NULL, NULL);
-	if (error != 0)
-		return (error);
-	sdp = dvp->v_data;
-
-	mutex_enter(&sdp->sd_lock);
-	sep = avl_first(&sdp->sd_snaps);
-	while (sep != NULL) {
-		vp = sep->se_root;
-		if (vp == ap->a_vp)
-			break;
-		sep = AVL_NEXT(&sdp->sd_snaps, sep);
-	}
-	if (sep == NULL) {
-		mutex_exit(&sdp->sd_lock);
-		error = ENOENT;
-	} else {
-		size_t len;
-
-		len = strlen(sep->se_name);
+	/* Look up .zfs/snapshot, our parent. */
+	error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp);
+	if (error == 0) {
+		VOP_UNLOCK(dvp, 0);
+		*ap->a_vpp = dvp;
 		*ap->a_buflen -= len;
-		bcopy(sep->se_name, ap->a_buf + *ap->a_buflen, len);
-		mutex_exit(&sdp->sd_lock);
-		vref(dvp);
-		*ap->a_vpp = dvp;
+		bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len);
 	}
-	VN_RELE(dvp);
-
+	vfs_unbusy(mp);
+	vget(vp, locked | LK_RETRY, curthread);
+	vdrop(vp);
 	return (error);
 }
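
[Editor's note] The vhold/vput/vget dance in zfsctl_snapshot_vptocnp() above is worth spelling out: the vnode lock cannot be held across vfs_busy(), so the code keeps the vnode's memory alive with a hold, drops the lock, busies the mount, and only afterwards relocks the vnode and drops the hold. Below is a minimal userland analog of that pattern, with pthread mutexes and plain counters standing in for the vnode and mount interlocks; every name here is illustrative, not kernel API.

#include <pthread.h>
#include <stdio.h>

struct obj {
	pthread_mutex_t lock;	/* the "vnode lock" */
	int usecount;		/* references that keep it visible */
	int holdcnt;		/* holds that keep the memory alive */
};

static pthread_mutex_t mount_busy = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
	struct obj vp = { PTHREAD_MUTEX_INITIALIZER, 1, 0 };

	pthread_mutex_lock(&vp.lock);		/* caller enters locked */

	vp.holdcnt++;				/* vhold(vp) */
	vp.usecount--;				/* vput(vp): unref... */
	pthread_mutex_unlock(&vp.lock);		/* ...and unlock */

	pthread_mutex_lock(&mount_busy);	/* vfs_busy(mp, 0) */
	/* ... look up the parent directory here ... */
	pthread_mutex_unlock(&mount_busy);	/* vfs_unbusy(mp) */

	pthread_mutex_lock(&vp.lock);		/* vget(vp, locked) */
	vp.usecount++;
	vp.holdcnt--;				/* vdrop(vp) */
	printf("use %d hold %d\n", vp.usecount, vp.holdcnt);
	pthread_mutex_unlock(&vp.lock);
	return (0);
}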
 
@@ -1640,71 +1247,40 @@
  * be covered.
  */
 static struct vop_vector zfsctl_ops_snapshot = {
-	.vop_default =	&default_vnodeops,
-	.vop_inactive =	zfsctl_snapshot_inactive,
-	.vop_lookup =	zfsctl_snapshot_lookup,
-	.vop_reclaim =	zfsctl_common_reclaim,
-	.vop_getattr =	zfsctl_snapshot_getattr,
-	.vop_fid =	zfsctl_snapshot_fid,
-	.vop_vptocnp =	zfsctl_snapshot_vptocnp,
+	.vop_default =		NULL, /* ensure very restricted access */
+	.vop_inactive =		zfsctl_snapshot_inactive,
+	.vop_reclaim =		zfsctl_snapshot_reclaim,
+	.vop_vptocnp =		zfsctl_snapshot_vptocnp,
+	.vop_lock1 =		vop_stdlock,
+	.vop_unlock =		vop_stdunlock,
+	.vop_islocked =		vop_stdislocked,
+	.vop_advlockpurge =	vop_stdadvlockpurge, /* called by vgone */
+	.vop_print =		zfsctl_common_print,
 };
 
 int
 zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
 {
+	struct mount *mp;
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
-	vnode_t *dvp, *vp;
-	zfsctl_snapdir_t *sdp;
-	zfsctl_node_t *zcp;
-	zfs_snapentry_t *sep;
+	vnode_t *vp;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
-	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
-	    NULL, 0, NULL, kcred, NULL, NULL, NULL);
-	if (error != 0)
-		return (error);
-	sdp = dvp->v_data;
-
-	mutex_enter(&sdp->sd_lock);
-	sep = avl_first(&sdp->sd_snaps);
-	while (sep != NULL) {
-		vp = sep->se_root;
-		zcp = vp->v_data;
-		if (zcp->zc_id == objsetid)
-			break;
-
-		sep = AVL_NEXT(&sdp->sd_snaps, sep);
-	}
-
-	if (sep != NULL) {
-		VN_HOLD(vp);
+	*zfsvfsp = NULL;
+	error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+	    ZFSCTL_INO_SNAPDIR, objsetid, &vp);
+	if (error == 0 && vp != NULL) {
 		/*
-		 * Return the mounted root rather than the covered mount point.
-		 * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid>
-		 * and returns the ZFS vnode mounted on top of the GFS node.
-		 * This ZFS vnode is the root of the vfs for objset 'objsetid'.
+		 * XXX Probably need to at least reference, if not busy, the mp.
 		 */
-		error = traverse(&vp, LK_SHARED | LK_RETRY);
-		if (error == 0) {
-			if (vp == sep->se_root)
-				error = SET_ERROR(EINVAL);
-			else
-				*zfsvfsp = VTOZ(vp)->z_zfsvfs;
-		}
-		mutex_exit(&sdp->sd_lock);
-		if (error == 0)
-			VN_URELE(vp);
-		else
-			VN_RELE(vp);
-	} else {
-		error = SET_ERROR(EINVAL);
-		mutex_exit(&sdp->sd_lock);
+		if (vp->v_mountedhere != NULL)
+			*zfsvfsp = vp->v_mountedhere->mnt_data;
+		vput(vp);
 	}
-
-	VN_RELE(dvp);
-
-	return (error);
+	if (*zfsvfsp == NULL)
+		return (SET_ERROR(EINVAL));
+	return (0);
 }
 
 /*
@@ -1715,52 +1291,70 @@
 int
 zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
 {
+	char snapname[ZFS_MAX_DATASET_NAME_LEN];
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	struct mount *mp;
 	vnode_t *dvp;
-	zfsctl_snapdir_t *sdp;
-	zfs_snapentry_t *sep, *next;
+	vnode_t *vp;
+	sfs_node_t *node;
+	sfs_node_t *snap;
+	uint64_t cookie;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
-	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
-	    NULL, 0, NULL, cr, NULL, NULL, NULL);
-	if (error != 0)
-		return (error);
-	sdp = dvp->v_data;
 
-	mutex_enter(&sdp->sd_lock);
+	cookie = 0;
+	for (;;) {
+		uint64_t id;
 
-	sep = avl_first(&sdp->sd_snaps);
-	while (sep != NULL) {
-		next = AVL_NEXT(&sdp->sd_snaps, sep);
+		dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
+		    snapname, &id, &cookie, NULL);
+		dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		if (error != 0) {
+			if (error == ENOENT)
+				error = 0;
+			break;
+		}
 
-		/*
-		 * If this snapshot is not mounted, then it must
-		 * have just been unmounted by somebody else, and
-		 * will be cleaned up by zfsctl_snapdir_inactive().
-		 */
-		if (vn_ismntpt(sep->se_root)) {
-			error = zfsctl_unmount_snap(sep, fflags, cr);
-			if (error) {
-				avl_index_t where;
+		for (;;) {
+			error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+			    ZFSCTL_INO_SNAPDIR, id, &vp);
+			if (error != 0 || vp == NULL)
+				break;
 
-				/*
-				 * Before reinserting snapshot to the tree,
-				 * check if it was actually removed. For example
-				 * when snapshot mount point is busy, we will
-				 * have an error here, but there will be no need
-				 * to reinsert snapshot.
-				 */
-				if (avl_find(&sdp->sd_snaps, sep, &where) == NULL)
-					avl_insert(&sdp->sd_snaps, sep, where);
+			mp = vp->v_mountedhere;
+
+			/*
+			 * v_mountedhere being NULL means that the
+			 * (uncovered) vnode is in a transient state
+			 * (mounting or unmounting), so loop until it
+			 * settles down.
+			 */
+			if (mp != NULL)
 				break;
-			}
+			vput(vp);
 		}
-		sep = next;
+		if (error != 0)
+			break;
+		if (vp == NULL)
+			continue;	/* no mountpoint, nothing to do */
+
+		/*
+		 * The mount-point vnode is kept locked to avoid spurious EBUSY
+		 * from a concurrent umount.
+		 * The vnode lock must have recursive locking enabled.
+		 */
+		vfs_ref(mp);
+		error = dounmount(mp, fflags, curthread);
+		KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1,
+		    ("extra references after unmount"));
+		vput(vp);
+		if (error != 0)
+			break;
 	}
-
-	mutex_exit(&sdp->sd_lock);
-	VN_RELE(dvp);
-
+	KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0,
+	    ("force unmounting failed"));
 	return (error);
 }
+
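
[Editor's note] The new zfsctl_snapdir_lookup() above replaces the AVL-based snapshot cache with a retry loop: fetch the vnode, detect a freshly created one by its exclusive lock, otherwise try a non-blocking lock upgrade, and on contention release everything and kern_yield() so the thread performing the lengthy mount is not starved. The following small userland sketch shows that try-lock-or-yield shape; pthreads stand in for the vnode lock and the busy loop stands in for mount_snapshot(), so none of these names are the kernel's.

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t mount_lock = PTHREAD_MUTEX_INITIALIZER;
static int mounted;

static void *mounter(void *arg)
{
	pthread_mutex_lock(&mount_lock);
	/* Lengthy, non-trivial "mount" while the lock is held. */
	for (volatile long i = 0; i < 100000000; i++)
		;
	mounted = 1;
	pthread_mutex_unlock(&mount_lock);
	return (NULL);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, mounter, NULL);
	for (;;) {
		if (pthread_mutex_trylock(&mount_lock) == 0) {
			int done = mounted;
			pthread_mutex_unlock(&mount_lock);
			if (done)
				break;
		}
		/* Analog of kern_yield(PRI_USER): don't starve the mounter. */
		sched_yield();
	}
	printf("snapshot mounted\n");
	pthread_join(t, NULL);
	return (0);
}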

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -29,7 +29,7 @@
 list_t zfs_dbgmsgs;
 int zfs_dbgmsg_size;
 kmutex_t zfs_dbgmsgs_lock;
-int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
 
 void
 zfs_dbgmsg_init(void)
@@ -58,7 +58,10 @@
  * echo ::zfs_dbgmsg | mdb -k
  *
  * Monitor these messages by running:
- * 	dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
+ * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
+ *
+ * When used with libzpool, monitor with:
+ * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}'
  */
 void
 zfs_dbgmsg(const char *fmt, ...)
@@ -95,3 +98,16 @@
 	}
 	mutex_exit(&zfs_dbgmsgs_lock);
 }
+
+void
+zfs_dbgmsg_print(const char *tag)
+{
+	zfs_dbgmsg_t *zdm;
+
+	(void) printf("ZFS_DBGMSG(%s):\n", tag);
+	mutex_enter(&zfs_dbgmsgs_lock);
+	for (zdm = list_head(&zfs_dbgmsgs); zdm;
+	    zdm = list_next(&zfs_dbgmsgs, zdm))
+		(void) printf("%s\n", zdm->zdm_msg);
+	mutex_exit(&zfs_dbgmsgs_lock);
+}
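
[Editor's note] For context on the zfs_dbgmsg machinery tuned above: messages are appended to a list whose total size is capped by zfs_dbgmsg_maxsize, and zfs_dbgmsg_print() simply walks that list under the lock. The oldest-first eviction in the sketch below is inferred from the tunable's name and is an assumption, not quoted kernel code; it is a rough userland model of such a size-capped FIFO.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct dbgmsg {
	struct dbgmsg *next;
	char msg[];
};

static struct dbgmsg *head, *tail;
static size_t total, maxsize = 64;	/* tiny cap for demonstration */

static void dbgmsg_add(const char *s)
{
	size_t len = strlen(s) + 1;
	struct dbgmsg *m = malloc(sizeof(*m) + len);

	m->next = NULL;
	memcpy(m->msg, s, len);
	if (tail != NULL)
		tail->next = m;
	else
		head = m;
	tail = m;
	total += len;
	while (total > maxsize && head != NULL) {	/* evict oldest */
		struct dbgmsg *old = head;

		head = old->next;
		if (head == NULL)
			tail = NULL;
		total -= strlen(old->msg) + 1;
		free(old);
	}
}

int main(void)
{
	for (int i = 0; i < 10; i++) {
		char buf[32];
		snprintf(buf, sizeof(buf), "debug message %d", i);
		dbgmsg_add(buf);
	}
	for (struct dbgmsg *m = head; m != NULL; m = m->next)
		printf("%s\n", m->msg);	/* like zfs_dbgmsg_print() */
	return (0);
}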

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -59,51 +59,37 @@
 #include <sys/extdirent.h>
 
 /*
- * zfs_match_find() is used by zfs_dirent_lock() to peform zap lookups
+ * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups
  * of names after deciding which is the appropriate lookup interface.
  */
 static int
-zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact,
-    boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid)
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
+    boolean_t exact, uint64_t *zoid)
 {
 	int error;
 
 	if (zfsvfs->z_norm) {
-		matchtype_t mt = MT_FIRST;
-		boolean_t conflict = B_FALSE;
-		size_t bufsz = 0;
-		char *buf = NULL;
+		matchtype_t mt = exact ? MT_EXACT : MT_FIRST;
 
-		if (rpnp) {
-			buf = rpnp->pn_buf;
-			bufsz = rpnp->pn_bufsize;
-		}
-		if (exact)
-			mt = MT_EXACT;
 		/*
 		 * In the non-mixed case we only expect there would ever
 		 * be one match, but we need to use the normalizing lookup.
 		 */
 		error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
-		    zoid, mt, buf, bufsz, &conflict);
-		if (!error && deflags)
-			*deflags = conflict ? ED_CASE_CONFLICT : 0;
+		    zoid, mt, NULL, 0, NULL);
 	} else {
 		error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
 	}
 	*zoid = ZFS_DIRENT_OBJ(*zoid);
 
-	if (error == ENOENT && update)
-		dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
-
 	return (error);
 }
 
 /*
- * Lock a directory entry.  A dirlock on <dzp, name> protects that name
- * in dzp's directory zap object.  As long as you hold a dirlock, you can
- * assume two things: (1) dzp cannot be reaped, and (2) no other thread
- * can change the zap entry for (i.e. link or unlink) this name.
+ * Look up a directory entry under a locked vnode.
+ * dvp being locked gives us a guarantee that there are no concurrent
+ * modification of the directory and, thus, if a node can be found in
+ * the directory, then it must not be unlinked.
  *
  * Input arguments:
  *	dzp	- znode for directory
@@ -110,45 +96,27 @@
  *	name	- name of entry to lock
  *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
  *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
- *		  ZSHARED: allow concurrent access with other ZSHARED callers.
  *		  ZXATTR: we want dzp's xattr directory
- *		  ZCILOOK: On a mixed sensitivity file system,
- *			   this lookup should be case-insensitive.
- *		  ZCIEXACT: On a purely case-insensitive file system,
- *			    this lookup should be case-sensitive.
- *		  ZRENAMING: we are locking for renaming, force narrow locks
- *		  ZHAVELOCK: Don't grab the z_name_lock for this call. The
- *			     current thread already holds it.
  *
  * Output arguments:
  *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
- *	dlpp	- pointer to the dirlock for this entry (NULL on error)
- *      direntflags - (case-insensitive lookup only)
- *		flags if multiple case-sensitive matches exist in directory
- *      realpnp     - (case-insensitive lookup only)
- *		actual name matched within the directory
  *
  * Return value: 0 on success or errno on failure.
  *
  * NOTE: Always checks for, and rejects, '.' and '..'.
- * NOTE: For case-insensitive file systems we take wide locks (see below),
- *	 but return znode pointers to a single match.
  */
 int
-zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
-    int flag, int *direntflags, pathname_t *realpnp)
+zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
 {
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zfs_dirlock_t	*dl;
-	boolean_t	update;
 	boolean_t	exact;
 	uint64_t	zoid;
 	vnode_t		*vp = NULL;
 	int		error = 0;
-	int		cmpflags;
 
+	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+
 	*zpp = NULL;
-	*dlpp = NULL;
 
 	/*
 	 * Verify that we are not trying to lock '.', '..', or '.zfs'
@@ -162,125 +130,20 @@
 	 * Case sensitivity and normalization preferences are set when
 	 * the file system is created.  These are stored in the
 	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
-	 * affect what vnodes can be cached in the DNLC, how we
-	 * perform zap lookups, and the "width" of our dirlocks.
+	 * affect how we perform zap lookups.
 	 *
-	 * A normal dirlock locks a single name.  Note that with
-	 * normalization a name can be composed multiple ways, but
-	 * when normalized, these names all compare equal.  A wide
-	 * dirlock locks multiple names.  We need these when the file
-	 * system is supporting mixed-mode access.  It is sometimes
-	 * necessary to lock all case permutations of file name at
-	 * once so that simultaneous case-insensitive/case-sensitive
-	 * behaves as rationally as possible.
-	 */
-
-	/*
 	 * Decide if exact matches should be requested when performing
 	 * a zap lookup on file systems supporting case-insensitive
 	 * access.
-	 */
-	exact =
-	    ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) ||
-	    ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK));
-
-	/*
-	 * Only look in or update the DNLC if we are looking for the
-	 * name on a file system that does not require normalization
-	 * or case folding.  We can also look there if we happen to be
-	 * on a non-normalizing, mixed sensitivity file system IF we
-	 * are looking for the exact name.
 	 *
-	 * Maybe can add TO-UPPERed version of name to dnlc in ci-only
-	 * case for performance improvement?
+	 * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
+	 * because in that case MT_EXACT and MT_FIRST should produce exactly
+	 * the same result.
 	 */
-	update = !zfsvfs->z_norm ||
-	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
-	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
+	exact = zfsvfs->z_case == ZFS_CASE_MIXED;
 
-	/*
-	 * ZRENAMING indicates we are in a situation where we should
-	 * take narrow locks regardless of the file system's
-	 * preferences for normalizing and case folding.  This will
-	 * prevent us deadlocking trying to grab the same wide lock
-	 * twice if the two names happen to be case-insensitive
-	 * matches.
-	 */
-	if (flag & ZRENAMING)
-		cmpflags = 0;
-	else
-		cmpflags = zfsvfs->z_norm;
-
-	/*
-	 * Wait until there are no locks on this name.
-	 *
-	 * Don't grab the the lock if it is already held. However, cannot
-	 * have both ZSHARED and ZHAVELOCK together.
-	 */
-	ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
-	if (!(flag & ZHAVELOCK))
-		rw_enter(&dzp->z_name_lock, RW_READER);
-
-	mutex_enter(&dzp->z_lock);
-	for (;;) {
-		if (dzp->z_unlinked) {
-			mutex_exit(&dzp->z_lock);
-			if (!(flag & ZHAVELOCK))
-				rw_exit(&dzp->z_name_lock);
-			return (SET_ERROR(ENOENT));
-		}
-		for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
-			if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
-			    U8_UNICODE_LATEST, &error) == 0) || error != 0)
-				break;
-		}
-		if (error != 0) {
-			mutex_exit(&dzp->z_lock);
-			if (!(flag & ZHAVELOCK))
-				rw_exit(&dzp->z_name_lock);
-			return (SET_ERROR(ENOENT));
-		}
-		if (dl == NULL)	{
-			size_t namesize;
-
-			/*
-			 * Allocate a new dirlock and add it to the list.
-			 */
-			namesize = strlen(name) + 1;
-			dl = kmem_alloc(sizeof (zfs_dirlock_t) + namesize,
-			    KM_SLEEP);
-			cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
-			dl->dl_name = (char *)(dl + 1);
-			bcopy(name, dl->dl_name, namesize);
-			dl->dl_sharecnt = 0;
-			dl->dl_namelock = 0;
-			dl->dl_namesize = namesize;
-			dl->dl_dzp = dzp;
-			dl->dl_next = dzp->z_dirlocks;
-			dzp->z_dirlocks = dl;
-			break;
-		}
-		if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
-			break;
-		cv_wait(&dl->dl_cv, &dzp->z_lock);
-	}
-
-	/*
-	 * If the z_name_lock was NOT held for this dirlock record it.
-	 */
-	if (flag & ZHAVELOCK)
-		dl->dl_namelock = 1;
-
-	if (flag & ZSHARED)
-		dl->dl_sharecnt++;
-
-	mutex_exit(&dzp->z_lock);
-
-	/*
-	 * We have a dirlock on the name.  (Note that it is the dirlock,
-	 * not the dzp's z_lock, that protects the name in the zap object.)
-	 * See if there's an object by this name; if so, put a hold on it.
-	 */
+	if (dzp->z_unlinked && !(flag & ZXATTR))
+		return (ENOENT);
 	if (flag & ZXATTR) {
 		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
 		    sizeof (zoid));
@@ -287,155 +150,73 @@
 		if (error == 0)
 			error = (zoid == 0 ? ENOENT : 0);
 	} else {
-		if (update)
-			vp = dnlc_lookup(ZTOV(dzp), name);
-		if (vp == DNLC_NO_VNODE) {
-			VN_RELE(vp);
-			error = SET_ERROR(ENOENT);
-		} else if (vp) {
-			if (flag & ZNEW) {
-				zfs_dirent_unlock(dl);
-				VN_RELE(vp);
-				return (SET_ERROR(EEXIST));
-			}
-			*dlpp = dl;
-			*zpp = VTOZ(vp);
-			return (0);
-		} else {
-			error = zfs_match_find(zfsvfs, dzp, name, exact,
-			    update, direntflags, realpnp, &zoid);
-		}
+		error = zfs_match_find(zfsvfs, dzp, name, exact, &zoid);
 	}
 	if (error) {
 		if (error != ENOENT || (flag & ZEXISTS)) {
-			zfs_dirent_unlock(dl);
 			return (error);
 		}
 	} else {
 		if (flag & ZNEW) {
-			zfs_dirent_unlock(dl);
 			return (SET_ERROR(EEXIST));
 		}
 		error = zfs_zget(zfsvfs, zoid, zpp);
-		if (error) {
-			zfs_dirent_unlock(dl);
+		if (error)
 			return (error);
-		}
-		if (!(flag & ZXATTR) && update)
-			dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
+		ASSERT(!(*zpp)->z_unlinked);
 	}
 
-	*dlpp = dl;
-
 	return (0);
 }
 
-/*
- * Unlock this directory entry and wake anyone who was waiting for it.
- */
-void
-zfs_dirent_unlock(zfs_dirlock_t *dl)
+static int
+zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
 {
-	znode_t *dzp = dl->dl_dzp;
-	zfs_dirlock_t **prev_dl, *cur_dl;
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	znode_t *zp;
+	uint64_t parent;
+	int error;
 
-	mutex_enter(&dzp->z_lock);
+	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
 
-	if (!dl->dl_namelock)
-		rw_exit(&dzp->z_name_lock);
+	if (dzp->z_unlinked)
+		return (ENOENT);
 
-	if (dl->dl_sharecnt > 1) {
-		dl->dl_sharecnt--;
-		mutex_exit(&dzp->z_lock);
-		return;
-	}
-	prev_dl = &dzp->z_dirlocks;
-	while ((cur_dl = *prev_dl) != dl)
-		prev_dl = &cur_dl->dl_next;
-	*prev_dl = dl->dl_next;
-	cv_broadcast(&dl->dl_cv);
-	mutex_exit(&dzp->z_lock);
+	if ((error = sa_lookup(dzp->z_sa_hdl,
+	    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+		return (error);
 
-	cv_destroy(&dl->dl_cv);
-	kmem_free(dl, sizeof (*dl) + dl->dl_namesize);
+	error = zfs_zget(zfsvfs, parent, &zp);
+	if (error == 0)
+		*zpp = zp;
+	return (error);
 }
 
-/*
- * Look up an entry in a directory.
- *
- * NOTE: '.' and '..' are handled as special cases because
- *	no directory entries are actually stored for them.  If this is
- *	the root of a filesystem, then '.zfs' is also treated as a
- *	special pseudo-directory.
- */
 int
-zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags,
-    int *deflg, pathname_t *rpnp)
+zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
 {
-	zfs_dirlock_t *dl;
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
 	znode_t *zp;
 	int error = 0;
-	uint64_t parent;
-	int unlinked;
 
+	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
+
+	if (dzp->z_unlinked)
+		return (SET_ERROR(ENOENT));
+
 	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
-		mutex_enter(&dzp->z_lock);
-		unlinked = dzp->z_unlinked;
-		mutex_exit(&dzp->z_lock);
-		if (unlinked)
-			return (ENOENT);
-
-		*vpp = ZTOV(dzp);
-		VN_HOLD(*vpp);
+		*zpp = dzp;
 	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
-		zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
-
-		/*
-		 * If we are a snapshot mounted under .zfs, return
-		 * the vp for the snapshot directory.
-		 */
-		if ((error = sa_lookup(dzp->z_sa_hdl,
-		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
-			return (error);
-		if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
-			error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
-			    "snapshot", vpp, NULL, 0, NULL, kcred,
-			    NULL, NULL, NULL);
-			return (error);
-		}
-
-		mutex_enter(&dzp->z_lock);
-		unlinked = dzp->z_unlinked;
-		mutex_exit(&dzp->z_lock);
-		if (unlinked)
-			return (ENOENT);
-
-		rw_enter(&dzp->z_parent_lock, RW_READER);
-		error = zfs_zget(zfsvfs, parent, &zp);
-		if (error == 0)
-			*vpp = ZTOV(zp);
-		rw_exit(&dzp->z_parent_lock);
-	} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
-		*vpp = zfsctl_root(dzp);
+		error = zfs_dd_lookup(dzp, zpp);
 	} else {
-		int zf;
-
-		zf = ZEXISTS | ZSHARED;
-		if (flags & FIGNORECASE)
-			zf |= ZCILOOK;
-
-		error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
+		error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
 		if (error == 0) {
-			*vpp = ZTOV(zp);
-			zfs_dirent_unlock(dl);
 			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+			*zpp = zp;
 		}
-		rpnp = NULL;
 	}
-
-	if ((flags & FIGNORECASE) && rpnp && !error)
-		(void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
-
 	return (error);
 }
 
@@ -511,8 +292,9 @@
 		if (error != 0)
 			continue;
 
+		vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
 		zp->z_unlinked = B_TRUE;
-		VN_RELE(ZTOV(zp));
+		vput(ZTOV(zp));
 	}
 	zap_cursor_fini(&zc);
 }
@@ -536,7 +318,6 @@
 	znode_t		*xzp;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zfs_dirlock_t	dl;
 	int skipped = 0;
 	int error;
 
@@ -550,6 +331,7 @@
 			continue;
 		}
 
+		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
 		ASSERT((ZTOV(xzp)->v_type == VREG) ||
 		    (ZTOV(xzp)->v_type == VLNK));
 
@@ -560,23 +342,21 @@
 		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 		/* Is this really needed ? */
 		zfs_sa_upgrade_txholds(tx, xzp);
+		dmu_tx_mark_netfree(tx);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
-			VN_RELE(ZTOV(xzp));
+			vput(ZTOV(xzp));
 			skipped += 1;
 			continue;
 		}
-		bzero(&dl, sizeof (dl));
-		dl.dl_dzp = dzp;
-		dl.dl_name = zap.za_name;
 
-		error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+		error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
 		if (error)
 			skipped += 1;
 		dmu_tx_commit(tx);
 
-		VN_RELE(ZTOV(xzp));
+		vput(ZTOV(xzp));
 	}
 	zap_cursor_fini(&zc);
 	if (error != ENOENT)
@@ -596,6 +376,7 @@
 	int		error;
 
 	ASSERT(zp->z_links == 0);
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 
 	/*
 	 * If this is an attribute directory, purge its contents.
@@ -634,7 +415,8 @@
 	    &xattr_obj, sizeof (xattr_obj));
 	if (error == 0 && xattr_obj) {
 		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
-		ASSERT(error == 0);
+		ASSERT3S(error, ==, 0);
+		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	acl_obj = zfs_external_acl(zp);
@@ -668,12 +450,10 @@
 
 	if (xzp) {
 		ASSERT(error == 0);
-		mutex_enter(&xzp->z_lock);
 		xzp->z_unlinked = B_TRUE;	/* mark xzp for deletion */
 		xzp->z_links = 0;	/* no more links to it */
 		VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
 		    &xzp->z_links, sizeof (xzp->z_links), tx));
-		mutex_exit(&xzp->z_lock);
 		zfs_unlinked_add(xzp, tx);
 	}
 
@@ -686,7 +466,7 @@
 	dmu_tx_commit(tx);
 out:
 	if (xzp)
-		VN_RELE(ZTOV(xzp));
+		vput(ZTOV(xzp));
 }
 
 static uint64_t
@@ -700,12 +480,12 @@
 }
 
 /*
- * Link zp into dl.  Can only fail if zp has been unlinked.
+ * Link zp into dzp.  Can only fail if zp has been unlinked.
  */
 int
-zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag)
 {
-	znode_t *dzp = dl->dl_dzp;
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	vnode_t *vp = ZTOV(zp);
 	uint64_t value;
@@ -715,18 +495,32 @@
 	int count = 0;
 	int error;
 
-	mutex_enter(&zp->z_lock);
-
+	ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+#if 0
+	if (zp_is_dir) {
+		error = 0;
+		if (dzp->z_links >= LINK_MAX)
+			error = SET_ERROR(EMLINK);
+		return (error);
+	}
+#endif
 	if (!(flag & ZRENAMING)) {
 		if (zp->z_unlinked) {	/* no new links to unlinked zp */
 			ASSERT(!(flag & (ZNEW | ZEXISTS)));
-			mutex_exit(&zp->z_lock);
 			return (SET_ERROR(ENOENT));
 		}
+#if 0
+		if (zp->z_links >= LINK_MAX) {
+			return (SET_ERROR(EMLINK));
+		}
+#endif
 		zp->z_links++;
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
 		    &zp->z_links, sizeof (zp->z_links));
 
+	} else {
+		ASSERT(zp->z_unlinked == 0);
 	}
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
 	    &dzp->z_id, sizeof (dzp->z_id));
@@ -740,11 +534,8 @@
 		    ctime, B_TRUE);
 	}
 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
-	ASSERT(error == 0);
+	ASSERT0(error);
 
-	mutex_exit(&zp->z_lock);
-
-	mutex_enter(&dzp->z_lock);
 	dzp->z_size++;
 	dzp->z_links += zp_is_dir;
 	count = 0;
@@ -760,38 +551,32 @@
 	    &dzp->z_pflags, sizeof (dzp->z_pflags));
 	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
 	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
-	ASSERT(error == 0);
-	mutex_exit(&dzp->z_lock);
+	ASSERT0(error);
 
 	value = zfs_dirent(zp, zp->z_mode);
-	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
+	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
 	    8, 1, &value, tx);
-	ASSERT(error == 0);
+	VERIFY0(error);
 
-	dnlc_update(ZTOV(dzp), dl->dl_name, vp);
-
 	return (0);
 }
 
 static int
-zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
+zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
     int flag)
 {
 	int error;
 
 	if (zp->z_zfsvfs->z_norm) {
-		if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
-		    (flag & ZCIEXACT)) ||
-		    ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
-		    !(flag & ZCILOOK)))
+		if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED)
 			error = zap_remove_norm(zp->z_zfsvfs->z_os,
-			    dzp->z_id, dl->dl_name, MT_EXACT, tx);
+			    dzp->z_id, name, MT_EXACT, tx);
 		else
 			error = zap_remove_norm(zp->z_zfsvfs->z_os,
-			    dzp->z_id, dl->dl_name, MT_FIRST, tx);
+			    dzp->z_id, name, MT_FIRST, tx);
 	} else {
 		error = zap_remove(zp->z_zfsvfs->z_os,
-		    dzp->z_id, dl->dl_name, tx);
+		    dzp->z_id, name, tx);
 	}
 
 	return (error);
@@ -798,7 +583,7 @@
 }
 
 /*
- * Unlink zp from dl, and mark zp for deletion if this was the last link.
+ * Unlink zp from dzp, and mark zp for deletion if this was the last link.
  * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
  * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
  * If it's non-NULL, we use it to indicate whether the znode needs deletion,
@@ -805,10 +590,9 @@
  * and it's the caller's job to do it.
  */
 int
-zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
-	boolean_t *unlinkedp)
+zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag, boolean_t *unlinkedp)
 {
-	znode_t *dzp = dl->dl_dzp;
 	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
 	vnode_t *vp = ZTOV(zp);
 	int zp_is_dir = (vp->v_type == VDIR);
@@ -818,22 +602,12 @@
 	int count = 0;
 	int error;
 
-	dnlc_remove(ZTOV(dzp), dl->dl_name);
+	ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 
 	if (!(flag & ZRENAMING)) {
-		if (vn_vfswlock(vp))		/* prevent new mounts on zp */
-			return (SET_ERROR(EBUSY));
 
-		if (vn_ismntpt(vp)) {		/* don't remove mount point */
-			vn_vfsunlock(vp);
-			return (SET_ERROR(EBUSY));
-		}
-
-		mutex_enter(&zp->z_lock);
-
 		if (zp_is_dir && !zfs_dirempty(zp)) {
-			mutex_exit(&zp->z_lock);
-			vn_vfsunlock(vp);
 #ifdef illumos
 			return (SET_ERROR(EEXIST));
 #else
@@ -846,10 +620,8 @@
 		 * First try removing the name from the directory; if that
 		 * fails, return the error.
 		 */
-		error = zfs_dropname(dl, zp, dzp, tx, flag);
+		error = zfs_dropname(dzp, name, zp, tx, flag);
 		if (error != 0) {
-			mutex_exit(&zp->z_lock);
-			vn_vfsunlock(vp);
 			return (error);
 		}
 
@@ -876,16 +648,14 @@
 		    NULL, &zp->z_links, sizeof (zp->z_links));
 		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		count = 0;
-		ASSERT(error == 0);
-		mutex_exit(&zp->z_lock);
-		vn_vfsunlock(vp);
+		ASSERT0(error);
 	} else {
-		error = zfs_dropname(dl, zp, dzp, tx, flag);
+		ASSERT(zp->z_unlinked == 0);
+		error = zfs_dropname(dzp, name, zp, tx, flag);
 		if (error != 0)
 			return (error);
 	}
 
-	mutex_enter(&dzp->z_lock);
 	dzp->z_size--;		/* one dirent removed */
 	dzp->z_links -= zp_is_dir;	/* ".." link from zp */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
@@ -900,8 +670,7 @@
 	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
 	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
 	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
-	ASSERT(error == 0);
-	mutex_exit(&dzp->z_lock);
+	ASSERT0(error);
 
 	if (unlinkedp != NULL)
 		*unlinkedp = unlinked;
@@ -912,14 +681,12 @@
 }
 
 /*
- * Indicate whether the directory is empty.  Works with or without z_lock
- * held, but can only be consider a hint in the latter case.  Returns true
- * if only "." and ".." remain and there's no work in progress.
+ * Indicate whether the directory is empty.
  */
 boolean_t
 zfs_dirempty(znode_t *dzp)
 {
-	return (dzp->z_size == 2 && dzp->z_dirlocks == 0);
+	return (dzp->z_size == 2);
 }
 
 int
@@ -939,7 +706,7 @@
 	 * In FreeBSD, access checking for creating an EA is being done
 	 * in zfs_setextattr(),
 	 */
-#ifndef __FreeBSD__
+#ifndef __FreeBSD_kernel__
 	if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))
 		return (error);
 #endif
@@ -952,7 +719,8 @@
 		return (SET_ERROR(EDQUOT));
 	}
 
-top:
+	getnewvnode_reserve(1);
+
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
@@ -961,13 +729,8 @@
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		return (error);
@@ -992,6 +755,8 @@
 	zfs_acl_ids_free(&acl_ids);
 	dmu_tx_commit(tx);
 
+	getnewvnode_drop_reserve();
+
 	*xvpp = ZTOV(xzp);
 
 	return (0);
@@ -1015,23 +780,20 @@
 {
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	znode_t		*xzp;
-	zfs_dirlock_t	*dl;
 	vattr_t		va;
 	int		error;
 top:
-	error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
+	error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
 	if (error)
 		return (error);
 
 	if (xzp != NULL) {
 		*xvpp = ZTOV(xzp);
-		zfs_dirent_unlock(dl);
 		return (0);
 	}
 
 
 	if (!(flags & CREATE_XATTR_DIR)) {
-		zfs_dirent_unlock(dl);
 #ifdef illumos
 		return (SET_ERROR(ENOENT));
 #else
@@ -1040,7 +802,6 @@
 	}
 
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
-		zfs_dirent_unlock(dl);
 		return (SET_ERROR(EROFS));
 	}
 
@@ -1060,7 +821,6 @@
 	zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
 
 	error = zfs_make_xattrdir(zp, &va, xvpp, cr);
-	zfs_dirent_unlock(dl);
 
 	if (error == ERESTART) {
 		/* NB: we already did dmu_tx_wait() if necessary */
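
[Editor's note] The through-line of the zfs_dir.c changes above is that the per-name dirlock machinery is gone: callers now hold the directory vnode's lock across zfs_dirent_lookup(), zfs_link_create() and zfs_link_destroy(), which by itself guarantees that no concurrent link or unlink of the name can happen. A toy userland rendition of that contract follows, with a mutex standing in for the vnode lock and a fixed array for the ZAP object; the names are illustrative only.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct dirent_ent {
	const char *name;
	unsigned long long obj;		/* object id, like the ZAP value */
};

struct dir {
	pthread_mutex_t lock;		/* stands in for the vnode lock on dvp */
	struct dirent_ent ents[8];
	int nents;
};

/*
 * Caller must hold d->lock, the analog of ASSERT_VOP_LOCKED() in
 * zfs_dirent_lookup(); holding it means no entry can appear or
 * disappear while we scan.
 */
static int
dirent_lookup(struct dir *d, const char *name, unsigned long long *objp)
{
	for (int i = 0; i < d->nents; i++) {
		if (strcmp(d->ents[i].name, name) == 0) {
			*objp = d->ents[i].obj;
			return (0);
		}
	}
	return (-1);			/* would be ENOENT in the kernel */
}

int main(void)
{
	struct dir d = { PTHREAD_MUTEX_INITIALIZER, { { "a.txt", 7 } }, 1 };
	unsigned long long obj;

	pthread_mutex_lock(&d.lock);	/* lock the directory first */
	if (dirent_lookup(&d, "a.txt", &obj) == 0)
		printf("found object %llu\n", obj);
	pthread_mutex_unlock(&d.lock);
	return (0);
}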

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -361,8 +361,8 @@
 
 typedef struct zfs_ecksum_info {
 	/* histograms of set and cleared bits by bit number in a 64-bit word */
-	uint16_t zei_histogram_set[sizeof (uint64_t) * NBBY];
-	uint16_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
+	uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY];
+	uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
 
 	/* inline arrays of bits set and cleared. */
 	uint64_t zei_bits_set[ZFM_MAX_INLINE];
@@ -387,7 +387,7 @@
 } zfs_ecksum_info_t;
 
 static void
-update_histogram(uint64_t value_arg, uint16_t *hist, uint32_t *count)
+update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count)
 {
 	size_t i;
 	size_t bits = 0;
@@ -553,7 +553,7 @@
 	if (badbuf == NULL || goodbuf == NULL)
 		return (eip);
 
-	ASSERT3U(nui64s, <=, UINT16_MAX);
+	ASSERT3U(nui64s, <=, UINT32_MAX);
 	ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT3U(size, <=, UINT32_MAX);
@@ -655,10 +655,10 @@
 	} else {
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
-		    DATA_TYPE_UINT16_ARRAY,
+		    DATA_TYPE_UINT32_ARRAY,
 		    NBBY * sizeof (uint64_t), eip->zei_histogram_set,
 		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
-		    DATA_TYPE_UINT16_ARRAY,
+		    DATA_TYPE_UINT32_ARRAY,
 		    NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
 		    NULL);
 	}
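
[Editor's note] The zfs_fm.c hunks above widen the checksum-error histograms from uint16_t to uint32_t; given the accompanying assert change from UINT16_MAX to UINT32_MAX, the motivation is presumably that a maximum-sized block can now hold more 64-bit words than a 16-bit counter can represent. Here is a simplified sketch of the per-bit-position counting that update_histogram() performs; the bit ordering may differ from the real routine.

#include <stdint.h>
#include <stdio.h>

#define	NBBY	8

static void
update_histogram(uint64_t value, uint32_t *hist, uint32_t *count)
{
	/* Count the one bits and record which positions they occupy. */
	for (size_t i = 0; i < sizeof (value) * NBBY; i++) {
		if (value & (1ULL << i)) {
			hist[i]++;
			(*count)++;
		}
	}
}

int main(void)
{
	uint32_t hist[sizeof (uint64_t) * NBBY] = { 0 };
	uint32_t count = 0;

	update_histogram(0xff00ff00ff00ff00ULL, hist, &count);
	printf("bits set: %u, bit 8 seen %u times\n", count, hist[8]);
	return (0);
}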

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -407,7 +407,7 @@
 	domain = zfs_fuid_find_by_idx(zfsvfs, index);
 	ASSERT(domain != NULL);
 
-#ifdef sun
+#ifdef illumos
 	if (type == ZFS_OWNER || type == ZFS_ACE_USER) {
 		(void) kidmap_getuidbysid(crgetzone(cr), domain,
 		    FUID_RID(fuid), &id);
@@ -415,9 +415,9 @@
 		(void) kidmap_getgidbysid(crgetzone(cr), domain,
 		    FUID_RID(fuid), &id);
 	}
-#else	/* !sun */
+#else
 	id = UID_NOBODY;
-#endif	/* !sun */
+#endif
 	return (id);
 }
 
@@ -704,13 +704,13 @@
 boolean_t
 zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
 {
-#ifdef sun
+#ifdef illumos
 	ksid_t		*ksid = crgetsid(cr, KSID_GROUP);
 	ksidlist_t	*ksidlist = crgetsidlist(cr);
-#endif	/* !sun */
+#endif
 	uid_t		gid;
 
-#ifdef sun
+#ifdef illumos
 	if (ksid && ksidlist) {
 		int 		i;
 		ksid_t		*ksid_groups;
@@ -742,7 +742,7 @@
 			}
 		}
 	}
-#endif	/* !sun */
+#endif	/* illumos */
 
 	/*
 	 * Not found in ksidlist, check posix groups

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,14 +22,16 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011-2012 Pawel Jakub Dawidek <pawel at dawidek.net>.
- * All rights reserved.
+ * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
  * Copyright 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2014 Xin Li <delphij at FreeBSD.org>. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /*
@@ -132,6 +134,9 @@
  *         distinguish between the operation failing, and
  *         deserialization failing.
  */
+#ifdef __FreeBSD__
+#include "opt_kstack_pages.h"
+#endif
 
 #include <sys/types.h>
 #include <sys/param.h>
@@ -182,8 +187,10 @@
 #include <sys/dmu_objset.h>
 #include <sys/dmu_send.h>
 #include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
 #include <sys/dsl_userhold.h>
 #include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
@@ -191,14 +198,7 @@
 #include "zfs_comutil.h"
 #include "zfs_ioctl_compat.h"
 
-CTASSERT(sizeof(zfs_cmd_t) < IOCPARM_MAX);
 
-static int snapshot_list_prefetch;
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.snapshot_list_prefetch", &snapshot_list_prefetch);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, snapshot_list_prefetch, CTLFLAG_RW,
-    &snapshot_list_prefetch, 0, "Prefetch data when listing snapshots");
-
 static struct cdev *zfsdev;
 
 extern void zfs_init(void);
@@ -207,6 +207,7 @@
 uint_t zfs_fsyncer_key;
 extern uint_t rrw_tsd_key;
 static uint_t zfs_allow_log_key;
+extern uint_t zfs_geom_probe_vdev_key;
 
 typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *);
 typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *);
@@ -255,7 +256,7 @@
  
 static void zfsdev_close(void *data);
 
-static int zfs_prop_activate_feature(spa_t *spa, zfeature_info_t *feature);
+static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature);
 
 /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
 void
@@ -499,6 +500,14 @@
 	dsl_dataset_t *ds;
 	dsl_pool_t *dp;
 
+	/*
+	 * First do a quick check for root in the global zone, which
+	 * is allowed to do all write_perms.  This ensures that zfs_ioc_*
+	 * will get to handle nonexistent datasets.
+	 */
+	if (INGLOBALZONE(curthread) && secpolicy_zfs(cr) == 0)
+		return (0);
+
 	error = dsl_pool_hold(name, FTAG, &dp);
 	if (error != 0)
 		return (error);
@@ -634,12 +643,14 @@
 		break;
 
 	case ZFS_PROP_QUOTA:
+	case ZFS_PROP_FILESYSTEM_LIMIT:
+	case ZFS_PROP_SNAPSHOT_LIMIT:
 		if (!INGLOBALZONE(curthread)) {
 			uint64_t zoned;
-			char setpoint[MAXNAMELEN];
+			char setpoint[ZFS_MAX_DATASET_NAME_LEN];
 			/*
 			 * Unprivileged users are allowed to modify the
-			 * quota on things *under* (ie. contained by)
+			 * limit on things *under* (i.e. contained by)
 			 * the thing they own.
 			 */
 			if (dsl_prop_get_integer(dsname, "jailed", &zoned,
@@ -848,22 +859,9 @@
 		return (SET_ERROR(EINVAL));
 	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 	    pair = nextpair) {
-		dsl_pool_t *dp;
-		dsl_dataset_t *ds;
-
-		error = dsl_pool_hold(nvpair_name(pair), FTAG, &dp);
-		if (error != 0)
-			break;
 		nextpair = nvlist_next_nvpair(snaps, pair);
-		error = dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds);
-		if (error == 0)
-			dsl_dataset_rele(ds, FTAG);
-		dsl_pool_rele(dp, FTAG);
-
-		if (error == 0) {
-			error = zfs_secpolicy_destroy_perms(nvpair_name(pair),
-			    cr);
-		} else if (error == ENOENT) {
+		error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr);
+		if (error == ENOENT) {
 			/*
 			 * Ignore any snapshots that don't exist (we consider
 			 * them "already destroyed").  Remove the name from the
@@ -885,7 +883,7 @@
 int
 zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
 {
-	char	parentname[MAXNAMELEN];
+	char	parentname[ZFS_MAX_DATASET_NAME_LEN];
 	int	error;
 
 	if ((error = zfs_secpolicy_write_perms(from,
@@ -957,13 +955,13 @@
 	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone);
 
 	if (error == 0) {
-		char parentname[MAXNAMELEN];
+		char parentname[ZFS_MAX_DATASET_NAME_LEN];
 		dsl_dataset_t *origin = NULL;
 		dsl_dir_t *dd;
 		dd = clone->ds_dir;
 
 		error = dsl_dataset_hold_obj(dd->dd_pool,
-		    dd->dd_phys->dd_origin_obj, FTAG, &origin);
+		    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin);
 		if (error != 0) {
 			dsl_dataset_rele(clone, FTAG);
 			dsl_pool_rele(dp, FTAG);
@@ -1041,8 +1039,77 @@
 	return (error);
 }
 
+/*
+ * Check for permission to create each bookmark in the nvlist.
+ */
 /* ARGSUSED */
 static int
+zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	int error = 0;
+
+	for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+		char *name = nvpair_name(pair);
+		char *hashp = strchr(name, '#');
+
+		if (hashp == NULL) {
+			error = SET_ERROR(EINVAL);
+			break;
+		}
+		*hashp = '\0';
+		error = zfs_secpolicy_write_perms(name,
+		    ZFS_DELEG_PERM_BOOKMARK, cr);
+		*hashp = '#';
+		if (error != 0)
+			break;
+	}
+	return (error);
+}
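
The policy check above splits a "fs#bookmark" name in place: it overwrites
the '#' with a NUL so the delegation check sees only the filesystem name,
then restores the byte before returning, avoiding a scratch buffer.  A
minimal standalone sketch of the same pattern, with a hypothetical callback
standing in for the real delegation check:

	/* Run cb() on the filesystem part of "pool/fs#bmark". */
	static int
	check_fs_component(char *name, int (*cb)(const char *))
	{
		char *hashp = strchr(name, '#');
		int error;

		if (hashp == NULL)
			return (-1);	/* EINVAL in the code above */
		*hashp = '\0';		/* temporarily cut at the '#' */
		error = cb(name);	/* e.g. a write_perms check */
		*hashp = '#';		/* restore the caller's buffer */
		return (error);
	}
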
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	nvpair_t *pair, *nextpair;
+	int error = 0;
+
+	for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
+	    pair = nextpair) {
+		char *name = nvpair_name(pair);
+		char *hashp = strchr(name, '#');
+		nextpair = nvlist_next_nvpair(innvl, pair);
+
+		if (hashp == NULL) {
+			error = SET_ERROR(EINVAL);
+			break;
+		}
+
+		*hashp = '\0';
+		error = zfs_secpolicy_write_perms(name,
+		    ZFS_DELEG_PERM_DESTROY, cr);
+		*hashp = '#';
+		if (error == ENOENT) {
+			/*
+			 * Ignore any filesystems that don't exist (we consider
+			 * their bookmarks "already destroyed").  Remove
+			 * the name from the nvl here in case the filesystem
+			 * is created between now and when we try to destroy
+			 * the bookmark (in which case we don't want to
+			 * destroy it since we haven't checked for permission).
+			 */
+			fnvlist_remove_nvpair(innvl, pair);
+			error = 0;
+		}
+		if (error != 0)
+			break;
+	}
+
+	return (error);
+}
+
+/* ARGSUSED */
+static int
 zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	/*
@@ -1057,7 +1124,7 @@
 static int
 zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-	char	parentname[MAXNAMELEN];
+	char	parentname[ZFS_MAX_DATASET_NAME_LEN];
 	int	error;
 	char	*origin;
 
@@ -1200,7 +1267,7 @@
 
 	for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(holds, pair)) {
-		char fsname[MAXNAMELEN];
+		char fsname[ZFS_MAX_DATASET_NAME_LEN];
 		error = dmu_fsname(nvpair_name(pair), fsname);
 		if (error != 0)
 			return (error);
@@ -1221,7 +1288,7 @@
 
 	for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(innvl, pair)) {
-		char fsname[MAXNAMELEN];
+		char fsname[ZFS_MAX_DATASET_NAME_LEN];
 		error = dmu_fsname(nvpair_name(pair), fsname);
 		if (error != 0)
 			return (error);
@@ -1281,7 +1348,7 @@
 	if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
 	    iflag)) != 0) {
 		kmem_free(packed, size);
-		return (error);
+		return (SET_ERROR(EFAULT));
 	}
 
 	if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) {
@@ -1370,6 +1437,7 @@
 getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
 {
 	objset_t *os;
+	vfs_t *vfsp;
 	int error;
 
 	error = dmu_objset_hold(dsname, FTAG, &os);
@@ -1383,12 +1451,21 @@
 	mutex_enter(&os->os_user_ptr_lock);
 	*zfvp = dmu_objset_get_user(os);
 	if (*zfvp) {
-		VFS_HOLD((*zfvp)->z_vfs);
+		vfsp = (*zfvp)->z_vfs;
+		vfs_ref(vfsp);
 	} else {
 		error = SET_ERROR(ESRCH);
 	}
 	mutex_exit(&os->os_user_ptr_lock);
 	dmu_objset_rele(os, FTAG);
+	if (error == 0) {
+		error = vfs_busy(vfsp, 0);
+		vfs_rel(vfsp);
+		if (error != 0) {
+			*zfvp = NULL;
+			error = SET_ERROR(ESRCH);
+		}
+	}
 	return (error);
 }
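
On FreeBSD the illumos VFS_HOLD()/VFS_RELE() pair becomes a two-step
ref/busy protocol: vfs_ref() keeps the mount structure alive while
os_user_ptr_lock is dropped, and vfs_busy() then blocks any concurrent
unmount until the caller runs vfs_unbusy().  A hedged sketch of the idiom
in isolation:

	/* Pin a mount against unmount; caller must vfs_unbusy() later. */
	static int
	busy_mount(vfs_t *vfsp)
	{
		int error;

		vfs_ref(vfsp);			/* transient hold */
		error = vfs_busy(vfsp, 0);	/* fails during unmount */
		vfs_rel(vfsp);			/* drop the transient hold */
		return (error);
	}
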
 
@@ -1406,8 +1483,9 @@
 	if (getzfsvfs(name, zfvp) != 0)
 		error = zfsvfs_create(name, zfvp);
 	if (error == 0) {
-		rrw_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
+		rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
 		    RW_READER, tag);
+#ifdef illumos
 		if ((*zfvp)->z_unmounted) {
 			/*
 			 * XXX we could probably try again, since the unmounting
@@ -1414,9 +1492,16 @@
 			 * thread should be just about to disassociate the
 			 * objset from the zfsvfs.
 			 */
-			rrw_exit(&(*zfvp)->z_teardown_lock, tag);
+			rrm_exit(&(*zfvp)->z_teardown_lock, tag);
 			return (SET_ERROR(EBUSY));
 		}
+#else
+		/*
+		 * vfs_busy() ensures that the filesystem is not and
+		 * can not be unmounted.
+		 */
+		ASSERT(!(*zfvp)->z_unmounted);
+#endif
 	}
 	return (error);
 }
@@ -1424,10 +1509,14 @@
 static void
 zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
 {
-	rrw_exit(&zfsvfs->z_teardown_lock, tag);
+	rrm_exit(&zfsvfs->z_teardown_lock, tag);
 
 	if (zfsvfs->z_vfs) {
+#ifdef illumos
 		VFS_RELE(zfsvfs->z_vfs);
+#else
+		vfs_unbusy(zfsvfs->z_vfs);
+#endif
 	} else {
 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 		zfsvfs_free(zfsvfs);
@@ -1542,8 +1631,7 @@
 
 	nvlist_free(config);
 
-	if (props)
-		nvlist_free(props);
+	nvlist_free(props);
 
 	return (error);
 }
@@ -1829,6 +1917,7 @@
 	(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares);
 
+#ifdef illumos
 	/*
 	 * A root pool with concatenated devices is not supported.
 	 * Thus, can not add a device to a root pool.
@@ -1844,6 +1933,7 @@
 		spa_close(spa, FTAG);
 		return (SET_ERROR(EDOM));
 	}
+#endif /* illumos */
 
 	if (error == 0) {
 		error = spa_vdev_add(spa, config);
@@ -2278,7 +2368,8 @@
 	 * A dataset name of maximum length cannot have any snapshots,
 	 * so exit immediately.
 	 */
-	if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
+	if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >=
+	    ZFS_MAX_DATASET_NAME_LEN) {
 		dmu_objset_rele(os, FTAG);
 		return (SET_ERROR(ESRCH));
 	}
@@ -2372,7 +2463,7 @@
 	const char *propname = nvpair_name(pair);
 	zfs_prop_t prop = zfs_name_to_prop(propname);
 	uint64_t intval;
-	int err;
+	int err = -1;
 
 	if (prop == ZPROP_INVAL) {
 		if (zfs_prop_userquota(propname))
@@ -2399,6 +2490,21 @@
 	case ZFS_PROP_REFQUOTA:
 		err = dsl_dataset_set_refquota(dsname, source, intval);
 		break;
+	case ZFS_PROP_FILESYSTEM_LIMIT:
+	case ZFS_PROP_SNAPSHOT_LIMIT:
+		if (intval == UINT64_MAX) {
+			/* clearing the limit, just do it */
+			err = 0;
+		} else {
+			err = dsl_dir_activate_fs_ss_limit(dsname);
+		}
+		/*
+		 * Set err to -1 to force the zfs_set_prop_nvlist code down the
+		 * default path to set the value in the nvlist.
+		 */
+		if (err == 0)
+			err = -1;
+		break;
 	case ZFS_PROP_RESERVATION:
 		err = dsl_dir_set_reservation(dsname, source, intval);
 		break;
@@ -2406,8 +2512,7 @@
 		err = dsl_dataset_set_refreservation(dsname, source, intval);
 		break;
 	case ZFS_PROP_VOLSIZE:
-		err = zvol_set_volsize(dsname, ddi_driver_major(zfs_dip),
-		    intval);
+		err = zvol_set_volsize(dsname, intval);
 		break;
 	case ZFS_PROP_VERSION:
 	{
@@ -2429,38 +2534,6 @@
 		}
 		break;
 	}
-	case ZFS_PROP_COMPRESSION:
-	{
-		if (intval == ZIO_COMPRESS_LZ4) {
-			zfeature_info_t *feature =
-			    &spa_feature_table[SPA_FEATURE_LZ4_COMPRESS];
-			spa_t *spa;
-
-			if ((err = spa_open(dsname, &spa, FTAG)) != 0)
-				return (err);
-
-			/*
-			 * Setting the LZ4 compression algorithm activates
-			 * the feature.
-			 */
-			if (!spa_feature_is_active(spa, feature)) {
-				if ((err = zfs_prop_activate_feature(spa,
-				    feature)) != 0) {
-					spa_close(spa, FTAG);
-					return (err);
-				}
-			}
-
-			spa_close(spa, FTAG);
-		}
-		/*
-		 * We still want the default set action to be performed in the
-		 * caller, we only performed zfeature settings here.
-		 */
-		err = -1;
-		break;
-	}
-
 	default:
 		err = -1;
 	}
@@ -2637,7 +2710,6 @@
 
 	while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
 		const char *propname = nvpair_name(pair);
-		char *valstr;
 
 		if (!zfs_prop_user(propname) ||
 		    nvpair_type(pair) != DATA_TYPE_STRING)
@@ -2650,8 +2722,7 @@
 		if (strlen(propname) >= ZAP_MAXNAMELEN)
 			return (SET_ERROR(ENAMETOOLONG));
 
-		VERIFY(nvpair_value_string(pair, &valstr) == 0);
-		if (strlen(valstr) >= ZAP_MAXVALUELEN)
+		if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN)
 			return (E2BIG);
 	}
 	return (0);
@@ -2977,7 +3048,7 @@
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(vfsp, &mountlist, mnt_list) {
 		if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) {
-			VFS_HOLD(vfsp);
+			vfs_ref(vfsp);
 			break;
 		}
 	}
@@ -3100,7 +3171,7 @@
 	boolean_t fuids_ok, sa_ok;
 	uint64_t zplver = ZPL_VERSION;
 	objset_t *os = NULL;
-	char parentname[MAXNAMELEN];
+	char parentname[ZFS_MAX_DATASET_NAME_LEN];
 	char *cp;
 	spa_t *spa;
 	uint64_t spa_vers;
@@ -3297,6 +3368,10 @@
 		if (error != 0)
 			(void) dsl_destroy_head(fsname);
 	}
+#ifdef __FreeBSD__
+	if (error == 0)
+		zvol_create_minors(fsname);
+#endif
 	return (error);
 }
 
@@ -3336,7 +3411,8 @@
 		 * The snap name must contain an @, and the part after it must
 		 * contain only valid characters.
 		 */
-		if (cp == NULL || snapshot_namecheck(cp + 1, NULL, NULL) != 0)
+		if (cp == NULL ||
+		    zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
 			return (SET_ERROR(EINVAL));
 
 		/*
@@ -3401,6 +3477,53 @@
 	return (error);
 }
 
+#ifdef __FreeBSD__
+static int
+zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	char name[MAXNAMELEN];
+	spa_t *spa;
+	vdev_t *vd;
+	char *command;
+	uint64_t pool_guid;
+	uint64_t vdev_guid;
+	int error;
+
+	if (nvlist_lookup_uint64(innvl,
+	    ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+		return (EINVAL);
+	if (nvlist_lookup_uint64(innvl,
+	    ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
+		return (EINVAL);
+	if (nvlist_lookup_string(innvl,
+	    "command", &command) != 0)
+		return (EINVAL);
+
+	mutex_enter(&spa_namespace_lock);
+	spa = spa_by_guid(pool_guid, vdev_guid);
+	if (spa != NULL)
+		strcpy(name, spa_name(spa));
+	mutex_exit(&spa_namespace_lock);
+	if (spa == NULL)
+		return (ENOENT);
+
+	if ((error = spa_open(name, &spa, FTAG)) != 0)
+		return (error);
+	spa_vdev_state_enter(spa, SCL_ALL);
+	vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
+	if (vd == NULL) {
+		(void) spa_vdev_state_exit(spa, NULL, ENXIO);
+		spa_close(spa, FTAG);
+		return (ENODEV);
+	}
+	error = vdev_label_write_pad2(vd, command, strlen(command));
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+	txg_wait_synced(spa->spa_dsl_pool, 0);
+	spa_close(spa, FTAG);
+	return (error);
+}
+#endif
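
zfs_ioc_nextboot() looks up the pool by GUID, finds the vdev, and writes
the command string into that vdev label's pad2 area, where boot code can
read it on the next boot.  A hedged userland-style sketch of building its
input nvlist (the keys match the lookups above; the command string shown
is only an example value):

	nvlist_t *innvl = fnvlist_alloc();

	fnvlist_add_uint64(innvl, ZPOOL_CONFIG_POOL_GUID, pool_guid);
	fnvlist_add_uint64(innvl, ZPOOL_CONFIG_GUID, vdev_guid);
	fnvlist_add_string(innvl, "command", "zfs:tank/ROOT/newbe:");
	/* ... issue the "fbsd_nextboot" ioctl with innvl ... */
	fnvlist_free(innvl);
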
+
 /*
  * The dp_config_rwlock must not be held when calling this, because the
  * unmount may need to write out data.
@@ -3416,7 +3539,9 @@
 {
 	vfs_t *vfsp;
 	zfsvfs_t *zfsvfs;
+#ifdef illumos
 	int err;
+#endif
 
 	if (strchr(snapname, '@') == NULL)
 		return (0);
@@ -3428,21 +3553,20 @@
 	zfsvfs = vfsp->vfs_data;
 	ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
 
+#ifdef illumos
 	err = vn_vfswlock(vfsp->vfs_vnodecovered);
 	VFS_RELE(vfsp);
 	if (err != 0)
 		return (SET_ERROR(err));
+#endif
 
 	/*
 	 * Always force the unmount for snapshots.
 	 */
-
 #ifdef illumos
 	(void) dounmount(vfsp, MS_FORCE, kcred);
 #else
-	mtx_lock(&Giant);	/* dounmount() */
 	(void) dounmount(vfsp, MS_FORCE, curthread);
-	mtx_unlock(&Giant);	/* dounmount() */
 #endif
 	return (0);
 }
@@ -3471,7 +3595,7 @@
 		return;
 	ds = dmu_objset_ds(os);
 	if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) {
-		char originname[MAXNAMELEN];
+		char originname[ZFS_MAX_DATASET_NAME_LEN];
 		dsl_dataset_name(ds->ds_prev, originname);
 		dmu_objset_rele(os, FTAG);
 		(void) zfs_unmount_snap(originname);
@@ -3489,6 +3613,7 @@
  * outnvl: snapshot -> error code (int32)
  *
  */
+/* ARGSUSED */
 static int
 zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 {
@@ -3507,7 +3632,8 @@
 		const char *name = nvpair_name(pair);
 
 		/*
-		 * The snap must be in the specified pool.
+		 * The snap must be in the specified pool to prevent the
+		 * invalid removal of zvol minors below.
 		 */
 		if (strncmp(name, poolname, poollen) != 0 ||
 		    (name[poollen] != '/' && name[poollen] != '@'))
@@ -3516,7 +3642,9 @@
 		error = zfs_unmount_snap(name);
 		if (error != 0)
 			return (error);
-		(void) zvol_remove_minor(name);
+#if defined(__FreeBSD__)
+		zvol_remove_minors(name);
+#endif
 	}
 
 	return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
@@ -3523,6 +3651,100 @@
 }
 
 /*
+ * Create bookmarks.  Bookmark names are of the form <fs>#<bmark>.
+ * All bookmarks must be in the same pool.
+ *
+ * innvl: {
+ *     bookmark1 -> snapshot1, bookmark2 -> snapshot2
+ * }
+ *
+ * outnvl: bookmark -> error code (int32)
+ *
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+		char *snap_name;
+
+		/*
+		 * Verify the snapshot argument.
+		 */
+		if (nvpair_value_string(pair, &snap_name) != 0)
+			return (SET_ERROR(EINVAL));
+
+		/* Verify that the keys (bookmarks) are unique */
+		for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair);
+		    pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) {
+			if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0)
+				return (SET_ERROR(EINVAL));
+		}
+	}
+
+	return (dsl_bookmark_create(innvl, outnvl));
+}
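
Per the input contract above, each pair maps a new bookmark name to an
existing snapshot in the same pool.  A hedged caller-side sketch (the
names are examples; lzc_bookmark() in libzfs_core builds the same nvlist):

	nvlist_t *bmarks = fnvlist_alloc();
	nvlist_t *errlist = NULL;
	int error;

	/* key = bookmark to create, value = source snapshot */
	fnvlist_add_string(bmarks, "tank/fs#monday", "tank/fs@monday");
	error = lzc_bookmark(bmarks, &errlist);	/* -> dsl_bookmark_create() */
	fnvlist_free(bmarks);
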
+
+/*
+ * innvl: {
+ *     property 1, property 2, ...
+ * }
+ *
+ * outnvl: {
+ *     bookmark name 1 -> { property 1, property 2, ... },
+ *     bookmark name 2 -> { property 1, property 2, ... }
+ * }
+ *
+ */
+static int
+zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	return (dsl_get_bookmarks(fsname, innvl, outnvl));
+}
+
+/*
+ * innvl: {
+ *     bookmark name 1, bookmark name 2
+ * }
+ *
+ * outnvl: bookmark -> error code (int32)
+ *
+ */
+static int
+zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl,
+    nvlist_t *outnvl)
+{
+	int error, poollen;
+
+	poollen = strlen(poolname);
+	for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+		const char *name = nvpair_name(pair);
+		const char *cp = strchr(name, '#');
+
+		/*
+		 * The bookmark name must contain a '#', and the part after it
+		 * must contain only valid characters.
+		 */
+		if (cp == NULL ||
+		    zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
+			return (SET_ERROR(EINVAL));
+
+		/*
+		 * The bookmark must be in the specified pool.
+		 */
+		if (strncmp(name, poolname, poollen) != 0 ||
+		    (name[poollen] != '/' && name[poollen] != '#'))
+			return (SET_ERROR(EXDEV));
+	}
+
+	error = dsl_bookmark_destroy(innvl, outnvl);
+	return (error);
+}
+
+/*
  * inputs:
  * zc_name		name of dataset to destroy
  * zc_objset_type	type of objset
@@ -3546,34 +3768,48 @@
 	else
 		err = dsl_destroy_head(zc->zc_name);
 	if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0)
+#ifdef __FreeBSD__
+		zvol_remove_minors(zc->zc_name);
+#else
 		(void) zvol_remove_minor(zc->zc_name);
+#endif
 	return (err);
 }
 
 /*
- * inputs:
- * zc_name	name of dataset to rollback (to most recent snapshot)
+ * fsname is the name of the dataset to roll back (to most recent snapshot)
  *
- * outputs:	none
+ * innvl is not used.
+ *
+ * outnvl: "target" -> name of most recent snapshot
  */
+/* ARGSUSED */
 static int
-zfs_ioc_rollback(zfs_cmd_t *zc)
+zfs_ioc_rollback(const char *fsname, nvlist_t *args, nvlist_t *outnvl)
 {
 	zfsvfs_t *zfsvfs;
 	int error;
 
-	if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
+	if (getzfsvfs(fsname, &zfsvfs) == 0) {
+		dsl_dataset_t *ds;
+
+		ds = dmu_objset_ds(zfsvfs->z_os);
 		error = zfs_suspend_fs(zfsvfs);
 		if (error == 0) {
 			int resume_err;
 
-			error = dsl_dataset_rollback(zc->zc_name);
-			resume_err = zfs_resume_fs(zfsvfs, zc->zc_name);
+			error = dsl_dataset_rollback(fsname, zfsvfs, outnvl);
+			resume_err = zfs_resume_fs(zfsvfs, ds);
 			error = error ? error : resume_err;
 		}
+#ifdef illumos
 		VFS_RELE(zfsvfs->z_vfs);
+#else
+		vfs_unbusy(zfsvfs->z_vfs);
+#endif
 	} else {
-		error = dsl_dataset_rollback(zc->zc_name);
+		error = dsl_dataset_rollback(fsname, NULL, outnvl);
 	}
 	return (error);
 }
@@ -3582,7 +3818,7 @@
 recursive_unmount(const char *fsname, void *arg)
 {
 	const char *snapname = arg;
-	char fullname[MAXNAMELEN];
+	char fullname[ZFS_MAX_DATASET_NAME_LEN];
 
 	(void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname);
 	return (zfs_unmount_snap(fullname));
@@ -3600,10 +3836,12 @@
 zfs_ioc_rename(zfs_cmd_t *zc)
 {
 	boolean_t recursive = zc->zc_cookie & 1;
+	char *at;
+	boolean_t allow_mounted = B_TRUE;
+
 #ifdef __FreeBSD__
-	boolean_t allow_mounted = zc->zc_cookie & 2;
+	allow_mounted = (zc->zc_cookie & 2) != 0;
 #endif
-	char *at;
 
 	zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
 	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
@@ -3613,22 +3851,25 @@
 	at = strchr(zc->zc_name, '@');
 	if (at != NULL) {
 		/* snaps must be in same fs */
+		int error;
+
 		if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1))
 			return (SET_ERROR(EXDEV));
 		*at = '\0';
-#ifdef illumos
-		if (zc->zc_objset_type == DMU_OST_ZFS) {
-#else
-		if (zc->zc_objset_type == DMU_OST_ZFS && allow_mounted) {
-#endif
-			int error = dmu_objset_find(zc->zc_name,
+		if (zc->zc_objset_type == DMU_OST_ZFS && !allow_mounted) {
+			error = dmu_objset_find(zc->zc_name,
 			    recursive_unmount, at + 1,
 			    recursive ? DS_FIND_CHILDREN : 0);
-			if (error != 0)
+			if (error != 0) {
+				*at = '@';
 				return (error);
+			}
 		}
-		return (dsl_dataset_rename_snapshot(zc->zc_name,
-		    at + 1, strchr(zc->zc_value, '@') + 1, recursive));
+		error = dsl_dataset_rename_snapshot(zc->zc_name,
+		    at + 1, strchr(zc->zc_value, '@') + 1, recursive);
+		*at = '@';
+
+		return (error);
 	} else {
 #ifdef illumos
 		if (zc->zc_objset_type == DMU_OST_ZVOL)
@@ -3705,8 +3946,7 @@
 		 * the SPA supports it. We ignore any errors here since
 		 * we'll catch them later.
 		 */
-		if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
-		    nvpair_value_uint64(pair, &intval) == 0) {
+		if (nvpair_value_uint64(pair, &intval) == 0) {
 			if (intval >= ZIO_COMPRESS_GZIP_1 &&
 			    intval <= ZIO_COMPRESS_GZIP_9 &&
 			    zfs_earlier_version(dsname,
@@ -3720,15 +3960,13 @@
 				return (SET_ERROR(ENOTSUP));
 
 			if (intval == ZIO_COMPRESS_LZ4) {
-				zfeature_info_t *feature =
-				    &spa_feature_table[
-				    SPA_FEATURE_LZ4_COMPRESS];
 				spa_t *spa;
 
 				if ((err = spa_open(dsname, &spa, FTAG)) != 0)
 					return (err);
 
-				if (!spa_feature_is_enabled(spa, feature)) {
+				if (!spa_feature_is_enabled(spa,
+				    SPA_FEATURE_LZ4_COMPRESS)) {
 					spa_close(spa, FTAG);
 					return (SET_ERROR(ENOTSUP));
 				}
@@ -3754,9 +3992,40 @@
 			return (SET_ERROR(ENOTSUP));
 		break;
 
-	case ZFS_PROP_DEDUP:
-		if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
-			return (SET_ERROR(ENOTSUP));
+	case ZFS_PROP_RECORDSIZE:
+		/* Record sizes above 128k need the feature to be enabled */
+		if (nvpair_value_uint64(pair, &intval) == 0 &&
+		    intval > SPA_OLD_MAXBLOCKSIZE) {
+			spa_t *spa;
+
+			/*
+			 * If this is a bootable dataset then we don't allow
+			 * large (>128K) blocks, because GRUB doesn't
+			 * support them.
+			 */
+			if (zfs_is_bootfs(dsname) &&
+			    intval > SPA_OLD_MAXBLOCKSIZE) {
+				return (SET_ERROR(ERANGE));
+			}
+
+			/*
+			 * We don't allow setting the property above 1MB,
+			 * unless the tunable has been changed.
+			 */
+			if (intval > zfs_max_recordsize ||
+			    intval > SPA_MAXBLOCKSIZE)
+				return (SET_ERROR(ERANGE));
+
+			if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+				return (err);
+
+			if (!spa_feature_is_enabled(spa,
+			    SPA_FEATURE_LARGE_BLOCKS)) {
+				spa_close(spa, FTAG);
+				return (SET_ERROR(ENOTSUP));
+			}
+			spa_close(spa, FTAG);
+		}
 		break;
 
 	case ZFS_PROP_SHARESMB:
@@ -3773,7 +4042,46 @@
 				return (SET_ERROR(ENOTSUP));
 		}
 		break;
+
+	case ZFS_PROP_CHECKSUM:
+	case ZFS_PROP_DEDUP:
+	{
+		spa_feature_t feature;
+		spa_t *spa;
+
+		/* dedup feature version checks */
+		if (prop == ZFS_PROP_DEDUP &&
+		    zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
+			return (SET_ERROR(ENOTSUP));
+
+		if (nvpair_value_uint64(pair, &intval) != 0)
+			return (SET_ERROR(EINVAL));
+
+		/* check prop value is enabled in features */
+		feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK);
+		if (feature == SPA_FEATURE_NONE)
+			break;
+
+		if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+			return (err);
+		/*
+		 * Salted checksums are not supported on root pools.
+		 */
+		if (spa_bootfs(spa) != 0 &&
+		    intval < ZIO_CHECKSUM_FUNCTIONS &&
+		    (zio_checksum_table[intval].ci_flags &
+		    ZCHECKSUM_FLAG_SALTED)) {
+			spa_close(spa, FTAG);
+			return (SET_ERROR(ERANGE));
+		}
+		if (!spa_feature_is_enabled(spa, feature)) {
+			spa_close(spa, FTAG);
+			return (SET_ERROR(ENOTSUP));
+		}
+		spa_close(spa, FTAG);
+		break;
 	}
+	}
 
 	return (zfs_secpolicy_setprop(dsname, prop, pair, CRED()));
 }
@@ -3786,9 +4094,9 @@
 zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
-	zfeature_info_t *feature = arg;
+	spa_feature_t *featurep = arg;
 
-	if (!spa_feature_is_active(spa, feature))
+	if (!spa_feature_is_active(spa, *featurep))
 		return (0);
 	else
 		return (SET_ERROR(EBUSY));
@@ -3802,9 +4110,9 @@
 zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
-	zfeature_info_t *feature = arg;
+	spa_feature_t *featurep = arg;
 
-	spa_feature_incr(spa, feature, tx);
+	spa_feature_incr(spa, *featurep, tx);
 }
 
 /*
@@ -3813,7 +4121,7 @@
  * as being active.
  */
 static int
-zfs_prop_activate_feature(spa_t *spa, zfeature_info_t *feature)
+zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature)
 {
 	int err;
 
@@ -3820,7 +4128,7 @@
 	/* EBUSY here indicates that the feature is already active */
 	err = dsl_sync_task(spa_name(spa),
 	    zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync,
-	    feature, 2);
+	    &feature, 2, ZFS_SPACE_CHECK_RESERVED);
 
 	if (err != 0 && err != EBUSY)
 		return (err);
@@ -3956,6 +4264,56 @@
 	}
 }
 
+/*
+ * Extract properties that cannot be set PRIOR to the receipt of a dataset.
+ * For example, refquota cannot be set until after the receipt of a dataset,
+ * because in replication streams, an older/earlier snapshot may exceed the
+ * refquota.  We want to receive the older/earlier snapshot, but setting
+ * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent
+ * the older/earlier snapshot from being received (with EDQUOT).
+ *
+ * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario.
+ *
+ * libzfs will need to be judicious in handling errors encountered by props
+ * extracted by this function.
+ */
+static nvlist_t *
+extract_delay_props(nvlist_t *props)
+{
+	nvlist_t *delayprops;
+	nvpair_t *nvp, *tmp;
+	static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 };
+	int i;
+
+	VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+	for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL;
+	    nvp = nvlist_next_nvpair(props, nvp)) {
+		/*
+		 * strcmp() is safe because zfs_prop_to_name() always returns
+		 * a bounded string.
+		 */
+		for (i = 0; delayable[i] != 0; i++) {
+			if (strcmp(zfs_prop_to_name(delayable[i]),
+			    nvpair_name(nvp)) == 0) {
+				break;
+			}
+		}
+		if (delayable[i] != 0) {
+			tmp = nvlist_prev_nvpair(props, nvp);
+			VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0);
+			VERIFY(nvlist_remove_nvpair(props, nvp) == 0);
+			nvp = tmp;
+		}
+	}
+
+	if (nvlist_empty(delayprops)) {
+		nvlist_free(delayprops);
+		delayprops = NULL;
+	}
+	return (delayprops);
+}
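
The net effect in the receive path below: refquota is stripped from the
property list before the stream lands and is applied only after
dmu_recv_end() succeeds.  A condensed sketch of that ordering (error
handling elided; not verbatim from the function):

	delayprops = extract_delay_props(props);	/* pulls out refquota */
	(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, props, errors);
	error = dmu_recv_stream(&drc, fp, &off, cleanup_fd, &handle);
	if (error == 0)
		error = dmu_recv_end(&drc, zfsvfs);
	if (delayprops != NULL && error == 0)	/* refquota is safe now */
		(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
		    delayprops, errors);
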
+
 #ifdef	DEBUG
 static boolean_t zfs_ioc_recv_inject_err;
 #endif
@@ -3971,6 +4329,7 @@
  * zc_guid		force flag
  * zc_cleanup_fd	cleanup-on-exit file descriptor
  * zc_action_handle	handle for this guid/ds mapping (or zero on first call)
+ * zc_resumable		if data is incomplete assume sender will resume
  *
  * outputs:
  * zc_cookie		number of bytes read
@@ -3991,9 +4350,11 @@
 	offset_t off;
 	nvlist_t *props = NULL; /* sent properties */
 	nvlist_t *origprops = NULL; /* existing properties */
+	nvlist_t *delayprops = NULL; /* sent properties applied post-receive */
 	char *origin = NULL;
 	char *tosnap;
-	char tofs[ZFS_MAXNAMELEN];
+	char tofs[ZFS_MAX_DATASET_NAME_LEN];
+	cap_rights_t rights;
 	boolean_t first_recvd_props = B_FALSE;
 
 	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
@@ -4011,19 +4372,23 @@
 		return (error);
 
 	fd = zc->zc_cookie;
+#ifdef illumos
 	fp = getf(fd);
+#else
+	fget_read(curthread, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
+#endif
 	if (fp == NULL) {
 		nvlist_free(props);
 		return (SET_ERROR(EBADF));
 	}
 
-	VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	errors = fnvlist_alloc();
 
 	if (zc->zc_string[0])
 		origin = zc->zc_string;
 
 	error = dmu_recv_begin(tofs, tosnap,
-	    &zc->zc_begin_record, force, origin, &drc);
+	    &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc);
 	if (error != 0)
 		goto out;
 
@@ -4071,21 +4436,12 @@
 		props_error = dsl_prop_set_hasrecvd(tofs);
 
 		if (props_error == 0) {
+			delayprops = extract_delay_props(props);
 			(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
 			    props, errors);
 		}
 	}
 
-	if (zc->zc_nvlist_dst_size != 0 &&
-	    (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 ||
-	    put_nvlist(zc, errors) != 0)) {
-		/*
-		 * Caller made zc->zc_nvlist_dst less than the minimum expected
-		 * size or supplied an invalid address.
-		 */
-		props_error = SET_ERROR(EINVAL);
-	}
-
 	off = fp->f_offset;
 	error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd,
 	    &zc->zc_action_handle);
@@ -4095,23 +4451,63 @@
 
 		if (getzfsvfs(tofs, &zfsvfs) == 0) {
 			/* online recv */
+			dsl_dataset_t *ds;
 			int end_err;
 
+			ds = dmu_objset_ds(zfsvfs->z_os);
 			error = zfs_suspend_fs(zfsvfs);
 			/*
 			 * If the suspend fails, then the recv_end will
 			 * likely also fail, and clean up after itself.
 			 */
-			end_err = dmu_recv_end(&drc);
+			end_err = dmu_recv_end(&drc, zfsvfs);
 			if (error == 0)
-				error = zfs_resume_fs(zfsvfs, tofs);
+				error = zfs_resume_fs(zfsvfs, ds);
 			error = error ? error : end_err;
+#ifdef illumos
 			VFS_RELE(zfsvfs->z_vfs);
+#else
+			vfs_unbusy(zfsvfs->z_vfs);
+#endif
 		} else {
-			error = dmu_recv_end(&drc);
+			error = dmu_recv_end(&drc, NULL);
 		}
+
+		/* Set delayed properties now, after we're done receiving. */
+		if (delayprops != NULL && error == 0) {
+			(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
+			    delayprops, errors);
+		}
 	}
 
+	if (delayprops != NULL) {
+		/*
+		 * Merge delayed props back in with initial props, in case
+		 * this is a DEBUG kernel and zfs_ioc_recv_inject_err is set
+		 * (which means we have to make sure clear_received_props()
+		 * includes the delayed properties).
+		 *
+		 * Since zfs_ioc_recv_inject_err is only in DEBUG kernels,
+		 * using ASSERT() will be just like a VERIFY.
+		 */
+		ASSERT(nvlist_merge(props, delayprops, 0) == 0);
+		nvlist_free(delayprops);
+	}
+
+	/*
+	 * Now that all props, initial and delayed, are set, report the prop
+	 * errors to the caller.
+	 */
+	if (zc->zc_nvlist_dst_size != 0 &&
+	    (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 ||
+	    put_nvlist(zc, errors) != 0)) {
+		/*
+		 * Caller made zc->zc_nvlist_dst less than the minimum expected
+		 * size or supplied an invalid address.
+		 */
+		props_error = SET_ERROR(EINVAL);
+	}
+
 	zc->zc_cookie = off - fp->f_offset;
 	if (off >= 0 && off <= MAXOFFSET_T)
 		fp->f_offset = off;
@@ -4186,8 +4582,10 @@
  * zc_fromobj	objsetid of incremental fromsnap (may be zero)
  * zc_guid	if set, estimate size of stream only.  zc_cookie is ignored.
  *		output size in zc_objset_type.
+ * zc_flags	lzc_send_flags
  *
- * outputs: none
+ * outputs:
+ * zc_objset_type	estimated size, if zc_guid is set
  */
 static int
 zfs_ioc_send(zfs_cmd_t *zc)
@@ -4195,6 +4593,8 @@
 	int error;
 	offset_t off;
 	boolean_t estimate = (zc->zc_guid != 0);
+	boolean_t embedok = (zc->zc_flags & 0x1);
+	boolean_t large_block_ok = (zc->zc_flags & 0x2);
 
 	if (zc->zc_obj != 0) {
 		dsl_pool_t *dp;
@@ -4211,7 +4611,8 @@
 		}
 
 		if (dsl_dir_is_clone(tosnap->ds_dir))
-			zc->zc_fromobj = tosnap->ds_dir->dd_phys->dd_origin_obj;
+			zc->zc_fromobj =
+			    dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj;
 		dsl_dataset_rele(tosnap, FTAG);
 		dsl_pool_rele(dp, FTAG);
 	}
@@ -4249,16 +4650,25 @@
 		dsl_dataset_rele(tosnap, FTAG);
 		dsl_pool_rele(dp, FTAG);
 	} else {
-		file_t *fp = getf(zc->zc_cookie);
+		file_t *fp;
+		cap_rights_t rights;
+
+#ifdef illumos
+		fp = getf(zc->zc_cookie);
+#else
+		fget_write(curthread, zc->zc_cookie,
+		    cap_rights_init(&rights, CAP_WRITE), &fp);
+#endif
 		if (fp == NULL)
 			return (SET_ERROR(EBADF));
 
 		off = fp->f_offset;
 		error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
+		    zc->zc_fromobj, embedok, large_block_ok,
 #ifdef illumos
-		    zc->zc_fromobj, zc->zc_cookie, fp->f_vnode, &off);
+		    zc->zc_cookie, fp->f_vnode, &off);
 #else
-		    zc->zc_fromobj, zc->zc_cookie, fp, &off);
+		    zc->zc_cookie, fp, &off);
 #endif
 
 		if (off >= 0 && off <= MAXOFFSET_T)
@@ -4598,13 +5008,23 @@
 			 * objset needs to be closed & reopened (to grow the
 			 * objset_phys_t).  Suspend/resume the fs will do that.
 			 */
+			dsl_dataset_t *ds, *newds;
+
+			ds = dmu_objset_ds(zfsvfs->z_os);
 			error = zfs_suspend_fs(zfsvfs);
-			if (error == 0)
-				error = zfs_resume_fs(zfsvfs, zc->zc_name);
+			if (error == 0) {
+				dmu_objset_refresh_ownership(ds, &newds,
+				    zfsvfs);
+				error = zfs_resume_fs(zfsvfs, newds);
+			}
 		}
 		if (error == 0)
 			error = dmu_objset_userspace_upgrade(zfsvfs->z_os);
+#ifdef illumos
 		VFS_RELE(zfsvfs->z_vfs);
+#else
+		vfs_unbusy(zfsvfs->z_vfs);
+#endif
 	} else {
 		/* XXX kind of reading contents without owning */
 		error = dmu_objset_hold(zc->zc_name, FTAG, &os);
@@ -4618,7 +5038,7 @@
 	return (error);
 }
 
-#ifdef sun
+#ifdef illumos
 /*
  * We don't want to have a hard dependency
  * against some special symbols in sharefs
@@ -4636,10 +5056,10 @@
 ddi_modhandle_t nfs_mod;
 ddi_modhandle_t sharefs_mod;
 ddi_modhandle_t smbsrv_mod;
-#endif	/* sun */
+#endif	/* illumos */
 kmutex_t zfs_share_lock;
 
-#ifdef sun
+#ifdef illumos
 static int
 zfs_init_sharefs()
 {
@@ -4659,12 +5079,12 @@
 	}
 	return (0);
 }
-#endif	/* sun */
+#endif	/* illumos */
 
 static int
 zfs_ioc_share(zfs_cmd_t *zc)
 {
-#ifdef sun
+#ifdef illumos
 	int error;
 	int opcode;
 
@@ -4755,9 +5175,9 @@
 
 	return (error);
 
-#else	/* !sun */
+#else	/* !illumos */
 	return (ENOSYS);
-#endif	/* !sun */
+#endif	/* illumos */
 }
 
 ace_t full_access[] = {
@@ -4783,7 +5203,7 @@
 		return (error);
 
 	error = dmu_object_next(os, &zc->zc_obj, B_FALSE,
-	    os->os_dsl_dataset->ds_phys->ds_prev_snap_txg);
+	    dsl_dataset_phys(os->os_dsl_dataset)->ds_prev_snap_txg);
 
 	dmu_objset_rele(os, FTAG);
 	return (error);
@@ -4837,10 +5257,16 @@
 zfs_ioc_diff(zfs_cmd_t *zc)
 {
 	file_t *fp;
+	cap_rights_t rights;
 	offset_t off;
 	int error;
 
+#ifdef illumos
 	fp = getf(zc->zc_cookie);
+#else
+	fget_write(curthread, zc->zc_cookie,
+	    cap_rights_init(&rights, CAP_WRITE), &fp);
+#endif
 	if (fp == NULL)
 		return (SET_ERROR(EBADF));
 
@@ -4859,7 +5285,7 @@
 	return (error);
 }
 
-#ifdef sun
+#ifdef illumos
 /*
  * Remove all ACL files in shares dir
  */
@@ -4881,12 +5307,12 @@
 	zap_cursor_fini(&zc);
 	return (error);
 }
-#endif	/* sun */
+#endif	/* illumos */
 
 static int
 zfs_ioc_smb_acl(zfs_cmd_t *zc)
 {
-#ifdef sun
+#ifdef illumos
 	vnode_t *vp;
 	znode_t *dzp;
 	vnode_t *resourcevp = NULL;
@@ -4977,6 +5403,7 @@
 		if ((error = get_nvlist(zc->zc_nvlist_src,
 		    zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) {
 			VN_RELE(vp);
+			VN_RELE(ZTOV(sharedir));
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
@@ -5009,9 +5436,9 @@
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
-#else	/* !sun */
+#else	/* !illumos */
 	return (EOPNOTSUPP);
-#endif	/* !sun */
+#endif	/* illumos */
 }
 
 /*
@@ -5029,6 +5456,7 @@
 static int
 zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
 {
+	nvpair_t *pair;
 	nvlist_t *holds;
 	int cleanup_fd = -1;
 	int error;
@@ -5038,6 +5466,19 @@
 	if (error != 0)
 		return (SET_ERROR(EINVAL));
 
+	/* make sure the user didn't pass us any invalid (empty) tags */
+	for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(holds, pair)) {
+		char *htag;
+
+		error = nvpair_value_string(pair, &htag);
+		if (error != 0)
+			return (SET_ERROR(error));
+
+		if (strlen(htag) == 0)
+			return (SET_ERROR(EINVAL));
+	}
+
 	if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) {
 		error = zfs_onexit_fd_hold(cleanup_fd, &minor);
 		if (error != 0)
@@ -5151,11 +5592,19 @@
 		return (error);
 
 	error = dsl_dataset_hold(dp, lastsnap, FTAG, &new);
+	if (error == 0 && !new->ds_is_snapshot) {
+		dsl_dataset_rele(new, FTAG);
+		error = SET_ERROR(EINVAL);
+	}
 	if (error != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (error);
 	}
 	error = dsl_dataset_hold(dp, firstsnap, FTAG, &old);
+	if (error == 0 && !old->ds_is_snapshot) {
+		dsl_dataset_rele(old, FTAG);
+		error = SET_ERROR(EINVAL);
+	}
 	if (error != 0) {
 		dsl_dataset_rele(new, FTAG);
 		dsl_pool_rele(dp, FTAG);
@@ -5192,6 +5641,12 @@
  * innvl: {
  *     "fd" -> file descriptor to write stream to (int32)
  *     (optional) "fromsnap" -> full snap name to send an incremental from
+ *     (optional) "largeblockok" -> (value ignored)
+ *         presence indicates that blocks > 128KB are permitted
+ *     (optional) "embedok" -> (value ignored)
+ *         presence indicates DRR_WRITE_EMBEDDED records are permitted
+ *     (optional) "resume_object" and "resume_offset" -> (uint64)
+ *         if present, resume send stream from specified object and offset.
  * }
  *
  * outnvl is unused
@@ -5200,10 +5655,16 @@
 static int
 zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
 {
+	cap_rights_t rights;
+	file_t *fp;
 	int error;
 	offset_t off;
 	char *fromname = NULL;
 	int fd;
+	boolean_t largeblockok;
+	boolean_t embedok;
+	uint64_t resumeobj = 0;
+	uint64_t resumeoff = 0;
 
 	error = nvlist_lookup_int32(innvl, "fd", &fd);
 	if (error != 0)
@@ -5211,15 +5672,26 @@
 
 	(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
 
+	largeblockok = nvlist_exists(innvl, "largeblockok");
+	embedok = nvlist_exists(innvl, "embedok");
+
+	(void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
+	(void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
+
+#ifdef illumos
 	file_t *fp = getf(fd);
+#else
+	fget_write(curthread, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
+#endif
 	if (fp == NULL)
 		return (SET_ERROR(EBADF));
 
 	off = fp->f_offset;
+	error = dmu_send(snapname, fromname, embedok, largeblockok, fd,
 #ifdef illumos
-	error = dmu_send(snapname, fromname, fd, fp->f_vnode, &off);
+	    resumeobj, resumeoff, fp->f_vnode, &off);
 #else
-	error = dmu_send(snapname, fromname, fd, fp, &off);
+	    resumeobj, resumeoff, fp, &off);
 #endif
 
 #ifdef illumos
@@ -5238,7 +5710,8 @@
  * of bytes that will be written to the fd supplied to zfs_ioc_send_new().
  *
  * innvl: {
- *     (optional) "fromsnap" -> full snap name to send an incremental from
+ *     (optional) "from" -> full snap or bookmark name to send an incremental
+ *                          from
  * }
  *
  * outnvl: {
@@ -5249,7 +5722,6 @@
 zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	dsl_pool_t *dp;
-	dsl_dataset_t *fromsnap = NULL;
 	dsl_dataset_t *tosnap;
 	int error;
 	char *fromname;
@@ -5265,27 +5737,55 @@
 		return (error);
 	}
 
-	error = nvlist_lookup_string(innvl, "fromsnap", &fromname);
+	error = nvlist_lookup_string(innvl, "from", &fromname);
 	if (error == 0) {
-		error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
-		if (error != 0) {
-			dsl_dataset_rele(tosnap, FTAG);
-			dsl_pool_rele(dp, FTAG);
-			return (error);
+		if (strchr(fromname, '@') != NULL) {
+			/*
+			 * If from is a snapshot, hold it and use the more
+			 * efficient dmu_send_estimate to estimate send space
+			 * size using deadlists.
+			 */
+			dsl_dataset_t *fromsnap;
+			error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
+			if (error != 0)
+				goto out;
+			error = dmu_send_estimate(tosnap, fromsnap, &space);
+			dsl_dataset_rele(fromsnap, FTAG);
+		} else if (strchr(fromname, '#') != NULL) {
+			/*
+			 * If from is a bookmark, fetch the creation TXG of the
+			 * snapshot it was created from and use that to find
+			 * blocks that were born after it.
+			 */
+			zfs_bookmark_phys_t frombm;
+
+			error = dsl_bookmark_lookup(dp, fromname, tosnap,
+			    &frombm);
+			if (error != 0)
+				goto out;
+			error = dmu_send_estimate_from_txg(tosnap,
+			    frombm.zbm_creation_txg, &space);
+		} else {
+			/*
+			 * from is not properly formatted as a snapshot or
+			 * bookmark
+			 */
+			error = SET_ERROR(EINVAL);
+			goto out;
 		}
+	} else {
+		/* Estimating the size of a full send: use dmu_send_estimate. */
+		error = dmu_send_estimate(tosnap, NULL, &space);
 	}
 
-	error = dmu_send_estimate(tosnap, fromsnap, &space);
 	fnvlist_add_uint64(outnvl, "space", space);
 
-	if (fromsnap != NULL)
-		dsl_dataset_rele(fromsnap, FTAG);
+out:
 	dsl_dataset_rele(tosnap, FTAG);
 	dsl_pool_rele(dp, FTAG);
 	return (error);
 }
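
The separator in "from" now selects the estimation strategy: '@' holds the
snapshot and walks deadlists, while '#' resolves the bookmark and counts
blocks born after its creation TXG.  A hedged caller-side sketch:

	nvlist_t *innvl = fnvlist_alloc();

	/* '@' -> deadlist estimate, '#' -> birth-TXG estimate */
	fnvlist_add_string(innvl, "from", "tank/fs#monday");
	/* ... issue ZFS_IOC_SEND_SPACE against "tank/fs@friday" ... */
	fnvlist_free(innvl);
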
 
-
 static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];
 
 static void
@@ -5385,7 +5885,7 @@
 
 static void
 zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
-	zfs_secpolicy_func_t *secpolicy)
+    zfs_secpolicy_func_t *secpolicy)
 {
 	zfs_ioctl_register_legacy(ioc, func, secpolicy,
 	    DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
@@ -5437,6 +5937,23 @@
 	    zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
 
+	zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK,
+	    zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE);
+
+	zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK,
+	    zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
+	zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS,
+	    zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME,
+	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
+
+	zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS,
+	    zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks,
+	    POOL_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
 	/* IOCTLS that use the legacy function signature */
 
 	zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
@@ -5496,10 +6013,10 @@
 	    zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
 
 	zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log,
-	    zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED);
+	    zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME,
 	    zfs_ioc_dsobj_to_dsname,
-	    zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED);
+	    zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY,
 	    zfs_ioc_pool_get_history,
 	    zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
@@ -5508,7 +6025,7 @@
 	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
 
 	zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
-	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);
+	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
 	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);
 
@@ -5548,8 +6065,6 @@
 	    zfs_secpolicy_none);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy,
 	    zfs_secpolicy_destroy);
-	zfs_ioctl_register_dataset_modify(ZFS_IOC_ROLLBACK, zfs_ioc_rollback,
-	    zfs_secpolicy_rollback);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename,
 	    zfs_secpolicy_rename);
 	zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv,
@@ -5577,6 +6092,9 @@
 	    zfs_secpolicy_config, POOL_CHECK_NONE);
 	zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail,
 	    zfs_secpolicy_config, POOL_CHECK_NONE);
+	zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT,
+	    zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME,
+	    POOL_CHECK_NONE, B_FALSE, B_FALSE);
 #endif
 }
 
@@ -5676,7 +6194,7 @@
 {
 	int error = 0;
 
-#ifdef sun
+#ifdef illumos
 	if (getminor(*devp) != 0)
 		return (zvol_open(devp, flag, otyp, cr));
 #endif
@@ -5723,6 +6241,7 @@
 	zfs_iocparm_t *zc_iocparm;
 	int cflag, cmd, oldvecnum;
 	boolean_t newioc, compat;
+	void *compat_zc = NULL;
 	cred_t *cr = td->td_ucred;
 #endif
 	const zfs_ioc_vec_t *vec;
@@ -5731,10 +6250,10 @@
 
 	cflag = ZFS_CMD_COMPAT_NONE;
 	compat = B_FALSE;
-	newioc = B_TRUE;
+	newioc = B_TRUE;	/* "new" style (zfs_iocparm_t) ioctl */
 
 	len = IOCPARM_LEN(zcmd);
-	cmd = zcmd & 0xff;
+	vecnum = cmd = zcmd & 0xff;
 
 	/*
 	 * Check if we are talking to supported older binaries
@@ -5742,25 +6261,41 @@
 	 */
 	if (len != sizeof(zfs_iocparm_t)) {
 		newioc = B_FALSE;
-		if (len == sizeof(zfs_cmd_t)) {
+		compat = B_TRUE;
+
+		vecnum = cmd;
+
+		switch (len) {
+		case sizeof(zfs_cmd_zcmd_t):
 			cflag = ZFS_CMD_COMPAT_LZC;
-			vecnum = cmd;
-		} else if (len == sizeof(zfs_cmd_deadman_t)) {
+			break;
+		case sizeof(zfs_cmd_deadman_t):
 			cflag = ZFS_CMD_COMPAT_DEADMAN;
-			compat = B_TRUE;
-			vecnum = cmd;
-		} else if (len == sizeof(zfs_cmd_v28_t)) {
+			break;
+		case sizeof(zfs_cmd_v28_t):
 			cflag = ZFS_CMD_COMPAT_V28;
-			compat = B_TRUE;
-			vecnum = cmd;
-		} else if (len == sizeof(zfs_cmd_v15_t)) {
+			break;
+		case sizeof(zfs_cmd_v15_t):
+			if (cmd >= sizeof(zfs_ioctl_v15_to_v28) /
+			    sizeof(zfs_ioctl_v15_to_v28[0]))
+				return (EINVAL);
+
 			cflag = ZFS_CMD_COMPAT_V15;
-			compat = B_TRUE;
 			vecnum = zfs_ioctl_v15_to_v28[cmd];
-		} else
+
+			/*
+			 * Return without further handling
+			 * if the command is blacklisted.
+			 */
+			if (vecnum == ZFS_IOC_COMPAT_PASS)
+				return (0);
+			else if (vecnum == ZFS_IOC_COMPAT_FAIL)
+				return (ENOTSUP);
+			break;
+		default:
 			return (EINVAL);
-	} else
-		vecnum = cmd;
+		}
+	}
 
 #ifdef illumos
 	vecnum = cmd - ZFS_IOC_FIRST;
@@ -5767,26 +6302,13 @@
 	ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip));
 #endif
 
-	if (compat) {
-		if (vecnum == ZFS_IOC_COMPAT_PASS)
-			return (0);
-		else if (vecnum == ZFS_IOC_COMPAT_FAIL)
-			return (ENOTSUP);
-	}
-
-	/*
-	 * Check if we have sufficient kernel memory allocated
-	 * for the zfs_cmd_t request.  Bail out if not so we
-	 * will not access undefined memory region.
-	 */
 	if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
 		return (SET_ERROR(EINVAL));
 	vec = &zfs_ioc_vec[vecnum];
 
-#ifdef illumos
 	zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
-	bzero(zc, sizeof(zfs_cmd_t));
 
+#ifdef illumos
 	error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
 	if (error != 0) {
 		error = SET_ERROR(EFAULT);
@@ -5793,33 +6315,86 @@
 		goto out;
 	}
 #else	/* !illumos */
-	/*
-	 * We don't alloc/free zc only if talking to library ioctl version 2
-	 */
-	if (cflag != ZFS_CMD_COMPAT_LZC) {
-		zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
-		bzero(zc, sizeof(zfs_cmd_t));
-	} else {
-		zc = (void *)arg;
-		error = 0;
-	}
+	bzero(zc, sizeof(zfs_cmd_t));
 
 	if (newioc) {
 		zc_iocparm = (void *)arg;
-		if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) {
-			error = SET_ERROR(EFAULT);
+
+		switch (zc_iocparm->zfs_ioctl_version) {
+		case ZFS_IOCVER_CURRENT:
+			if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) {
+				error = SET_ERROR(EINVAL);
+				goto out;
+			}
+			break;
+		case ZFS_IOCVER_INLANES:
+			if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_inlanes_t)) {
+				error = SET_ERROR(EFAULT);
+				goto out;
+			}
+			compat = B_TRUE;
+			cflag = ZFS_CMD_COMPAT_INLANES;
+			break;
+		case ZFS_IOCVER_RESUME:
+			if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_resume_t)) {
+				error = SET_ERROR(EFAULT);
+				goto out;
+			}
+			compat = B_TRUE;
+			cflag = ZFS_CMD_COMPAT_RESUME;
+			break;
+		case ZFS_IOCVER_EDBP:
+			if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) {
+				error = SET_ERROR(EFAULT);
+				goto out;
+			}
+			compat = B_TRUE;
+			cflag = ZFS_CMD_COMPAT_EDBP;
+			break;
+		case ZFS_IOCVER_ZCMD:
+			if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) ||
+			    zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) {
+				error = SET_ERROR(EFAULT);
+				goto out;
+			}
+			compat = B_TRUE;
+			cflag = ZFS_CMD_COMPAT_ZCMD;
+			break;
+		default:
+			error = SET_ERROR(EINVAL);
 			goto out;
+			/* NOTREACHED */
 		}
-		error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd, zc,
-		    sizeof(zfs_cmd_t), flag);
-		if (error != 0) {
-			error = SET_ERROR(EFAULT);
-			goto out;
+
+		if (compat) {
+			ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
+			compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
+			bzero(compat_zc, sizeof(zfs_cmd_t));
+
+			error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
+			    compat_zc, zc_iocparm->zfs_cmd_size, flag);
+			if (error != 0) {
+				error = SET_ERROR(EFAULT);
+				goto out;
+			}
+		} else {
+			error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
+			    zc, zc_iocparm->zfs_cmd_size, flag);
+			if (error != 0) {
+				error = SET_ERROR(EFAULT);
+				goto out;
+			}
 		}
 	}
 
 	if (compat) {
-		zfs_cmd_compat_get(zc, arg, cflag);
+		if (newioc) {
+			ASSERT(compat_zc != NULL);
+			zfs_cmd_compat_get(zc, compat_zc, cflag);
+		} else {
+			ASSERT(compat_zc == NULL);
+			zfs_cmd_compat_get(zc, arg, cflag);
+		}
 		oldvecnum = vecnum;
 		error = zfs_ioctl_compat_pre(zc, &vecnum, cflag);
 		if (error != 0)
@@ -5867,7 +6442,7 @@
 		break;
 	}
 
-	if (error == 0 && !(flag & FKIOCTL))
+	if (error == 0)
 		error = vec->zvec_secpolicy(zc, innvl, cr);
 
 	if (error != 0)
@@ -5874,7 +6449,7 @@
 		goto out;
 
 	/* legacy ioctls can modify zc_name */
-	len = strcspn(zc->zc_name, "/@") + 1;
+	len = strcspn(zc->zc_name, "/@#") + 1;
 	saved_poolname = kmem_alloc(len, KM_SLEEP);
 	(void) strlcpy(saved_poolname, zc->zc_name, len);
 
@@ -5915,7 +6490,7 @@
 		fnvlist_free(lognv);
 
 		/* rewrite outnvl for backwards compatibility */
-		if (cflag != ZFS_CMD_COMPAT_NONE && cflag != ZFS_CMD_COMPAT_LZC)
+		if (compat)
 			outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum,
 			    cflag);
 
@@ -5940,17 +6515,30 @@
 out:
 	nvlist_free(innvl);
 
-	if (compat) {
-		zfs_ioctl_compat_post(zc, cmd, cflag);
-		zfs_cmd_compat_put(zc, arg, vecnum, cflag);
-	}
-
 #ifdef illumos
 	rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
 	if (error == 0 && rc != 0)
 		error = SET_ERROR(EFAULT);
 #else
-	if (newioc) {
+	if (compat) {
+		zfs_ioctl_compat_post(zc, cmd, cflag);
+		if (newioc) {
+			ASSERT(compat_zc != NULL);
+			ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
+
+			zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag);
+			rc = ddi_copyout(compat_zc,
+			    (void *)(uintptr_t)zc_iocparm->zfs_cmd,
+			    zc_iocparm->zfs_cmd_size, flag);
+			if (error == 0 && rc != 0)
+				error = SET_ERROR(EFAULT);
+			kmem_free(compat_zc, sizeof (zfs_cmd_t));
+		} else {
+			zfs_cmd_compat_put(zc, arg, vecnum, cflag);
+		}
+	} else {
+		ASSERT(newioc);
+
 		rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd,
 		    sizeof (zfs_cmd_t), flag);
 		if (error == 0 && rc != 0)
@@ -5967,19 +6555,11 @@
 			strfree(saved_poolname);
 	}
 
-#ifdef illumos
 	kmem_free(zc, sizeof (zfs_cmd_t));
-#else
-	/*
-	 * We don't alloc/free zc only if talking to library ioctl version 2
-	 */
-	if (cflag != ZFS_CMD_COMPAT_LZC)
-		kmem_free(zc, sizeof (zfs_cmd_t));
-#endif
 	return (error);
 }
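
The dispatch above keys first on IOCPARM_LEN() to distinguish the legacy
fixed-size zfs_cmd_t layouts from the zfs_iocparm_t wrapper, then on the
version stamped inside the wrapper.  A sketch of the wrapper shape implied
by the fields used here (field widths are an assumption; the real
definition lives in zfs_ioctl_compat.h):

	typedef struct zfs_iocparm_sketch {
		uint32_t	zfs_ioctl_version;	/* ZFS_IOCVER_* */
		uint64_t	zfs_cmd;	/* userland zfs_cmd_t pointer */
		uint64_t	zfs_cmd_size;	/* its size, selects compat */
	} zfs_iocparm_sketch_t;
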
 
-#ifdef sun
+#ifdef illumos
 static int
 zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 {
@@ -6030,7 +6610,7 @@
 
 	return (DDI_FAILURE);
 }
-#endif	/* sun */
+#endif	/* illumos */
 
 /*
  * OK, so this is a little weird.
@@ -6041,7 +6621,7 @@
  * /dev/zfs has basically nothing to do except serve up ioctls,
  * so most of the standard driver entry points are in zvol.c.
  */
-#ifdef sun
+#ifdef illumos
 static struct cb_ops zfs_cb_ops = {
 	zfsdev_open,	/* open */
 	zfsdev_close,	/* close */
@@ -6090,7 +6670,7 @@
 	(void *)&zfs_modldrv,
 	NULL
 };
-#endif	/* sun */
+#endif	/* illumos */
 
 static struct cdevsw zfs_cdevsw = {
 	.d_version =	D_VERSION,
@@ -6123,7 +6703,7 @@
 static struct root_hold_token *zfs_root_token;
 struct proc *zfsproc;
 
-#ifdef sun
+#ifdef illumos
 int
 _init(void)
 {
@@ -6186,56 +6766,111 @@
 {
 	return (mod_info(&modlinkage, modinfop));
 }
-#endif	/* sun */
+#endif	/* illumos */
 
-static int
-zfs_modevent(module_t mod, int type, void *unused __unused)
+static int zfs__init(void);
+static int zfs__fini(void);
+static void zfs_shutdown(void *, int);
+
+static eventhandler_tag zfs_shutdown_event_tag;
+
+#ifdef __FreeBSD__
+#define ZFS_MIN_KSTACK_PAGES 4
+#endif
+
+int
+zfs__init(void)
 {
-	int error = 0;
 
-	switch (type) {
-	case MOD_LOAD:
-		zfs_root_token = root_mount_hold("ZFS");
+#ifdef __FreeBSD__
+#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES
+	printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack "
+	    "overflow panic!\nPlease consider adding "
+	    "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES,
+	    ZFS_MIN_KSTACK_PAGES);
+#endif
+#endif
+	zfs_root_token = root_mount_hold("ZFS");
 
-		mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
 
-		spa_init(FREAD | FWRITE);
-		zfs_init();
-		zvol_init();
-		zfs_ioctl_init();
+	spa_init(FREAD | FWRITE);
+	zfs_init();
+	zvol_init();
+	zfs_ioctl_init();
 
-		tsd_create(&zfs_fsyncer_key, NULL);
-		tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
-		tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
+	tsd_create(&zfs_fsyncer_key, NULL);
+	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
+	tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
+	tsd_create(&zfs_geom_probe_vdev_key, NULL);
 
-		printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n");
-		root_mount_rel(zfs_root_token);
+	printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n");
+	root_mount_rel(zfs_root_token);
 
-		zfsdev_init();
-		break;
-	case MOD_UNLOAD:
-		if (spa_busy() || zfs_busy() || zvol_busy() ||
-		    zio_injection_enabled) {
-			error = EBUSY;
-			break;
-		}
+	zfsdev_init();
 
-		zfsdev_fini();
-		zvol_fini();
-		zfs_fini();
-		spa_fini();
+	return (0);
+}
 
-		tsd_destroy(&zfs_fsyncer_key);
-		tsd_destroy(&rrw_tsd_key);
-		tsd_destroy(&zfs_allow_log_key);
+int
+zfs__fini(void)
+{
+	if (spa_busy() || zfs_busy() || zvol_busy() ||
+	    zio_injection_enabled) {
+		return (EBUSY);
+	}
 
-		mutex_destroy(&zfs_share_lock);
-		break;
+	zfsdev_fini();
+	zvol_fini();
+	zfs_fini();
+	spa_fini();
+
+	tsd_destroy(&zfs_fsyncer_key);
+	tsd_destroy(&rrw_tsd_key);
+	tsd_destroy(&zfs_allow_log_key);
+
+	mutex_destroy(&zfs_share_lock);
+
+	return (0);
+}
+
+static void
+zfs_shutdown(void *arg __unused, int howto __unused)
+{
+
+	/*
+	 * ZFS fini routines cannot work properly in a panicked system.
+	 */
+	if (panicstr == NULL)
+		(void) zfs__fini();
+}
+
+
+static int
+zfs_modevent(module_t mod, int type, void *unused __unused)
+{
+	int err;
+
+	switch (type) {
+	case MOD_LOAD:
+		err = zfs__init();
+		if (err == 0)
+			zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
+			    shutdown_post_sync, zfs_shutdown, NULL,
+			    SHUTDOWN_PRI_FIRST);
+		return (err);
+	case MOD_UNLOAD:
+		err = zfs__fini();
+		if (err == 0 && zfs_shutdown_event_tag != NULL)
+			EVENTHANDLER_DEREGISTER(shutdown_post_sync,
+			    zfs_shutdown_event_tag);
+		return (err);
+	case MOD_SHUTDOWN:
+		return (0);
 	default:
-		error = EOPNOTSUPP;
 		break;
 	}
-	return (error);
+	return (EOPNOTSUPP);
 }
 
 static moduledata_t zfs_mod = {
@@ -6247,3 +6882,4 @@
 MODULE_VERSION(zfsctrl, 1);
 MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1);
 MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1);
+MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,6 +21,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/types.h>
@@ -349,7 +351,7 @@
  */
 void
 zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-	znode_t *dzp, char *name, uint64_t foid)
+    znode_t *dzp, char *name, uint64_t foid)
 {
 	itx_t *itx;
 	lr_remove_t *lr;
@@ -373,7 +375,7 @@
  */
 void
 zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-	znode_t *dzp, znode_t *zp, char *name)
+    znode_t *dzp, znode_t *zp, char *name)
 {
 	itx_t *itx;
 	lr_link_t *lr;
@@ -428,7 +430,7 @@
  */
 void
 zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-	znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
+    znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
 {
 	itx_t *itx;
 	lr_rename_t *lr;
@@ -456,23 +458,20 @@
 
 void
 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
-	znode_t *zp, offset_t off, ssize_t resid, int ioflag)
+    znode_t *zp, offset_t off, ssize_t resid, int ioflag)
 {
+	uint32_t blocksize = zp->z_blksz;
 	itx_wr_state_t write_state;
-	boolean_t slogging;
 	uintptr_t fsync_cnt;
-	ssize_t immediate_write_sz;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
 
-	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
-	    ? 0 : zfs_immediate_write_sz;
-
-	slogging = spa_has_slogs(zilog->zl_spa) &&
-	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
-	if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
+	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
 		write_state = WR_INDIRECT;
+	else if (!spa_has_slogs(zilog->zl_spa) &&
+	    resid >= zfs_immediate_write_sz)
+		write_state = WR_INDIRECT;
 	else if (ioflag & (FSYNC | FDSYNC))
 		write_state = WR_COPIED;
 	else
@@ -485,30 +484,26 @@
 	while (resid) {
 		itx_t *itx;
 		lr_write_t *lr;
-		ssize_t len;
+		itx_wr_state_t wr_state = write_state;
+		ssize_t len = resid;
 
-		/*
-		 * If the write would overflow the largest block then split it.
-		 */
-		if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
-			len = SPA_MAXBLOCKSIZE >> 1;
-		else
-			len = resid;
+		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
+			wr_state = WR_NEED_COPY;
+		else if (wr_state == WR_INDIRECT)
+			len = MIN(blocksize - P2PHASE(off, blocksize), resid);
 
 		itx = zil_itx_create(txtype, sizeof (*lr) +
-		    (write_state == WR_COPIED ? len : 0));
+		    (wr_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
-		if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
+		if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
 		    zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(txtype, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
-			write_state = WR_NEED_COPY;
+			wr_state = WR_NEED_COPY;
 		}
 
-		itx->itx_wr_state = write_state;
-		if (write_state == WR_NEED_COPY)
-			itx->itx_sod += len;
+		itx->itx_wr_state = wr_state;
 		lr->lr_foid = zp->z_id;
 		lr->lr_offset = off;
 		lr->lr_length = len;
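
For WR_INDIRECT the record length is clipped to the current block:
P2PHASE(off, blocksize) is the offset within the block, so the remainder
of the block bounds the chunk.  A worked example under the new rule
(P2PHASE is the power-of-two modulo from sysmacros.h):

	/*
	 * blocksize = 128K, off = 96K, resid = 200K:
	 * P2PHASE(96K, 128K) = 96K, so the first chunk is
	 * MIN(128K - 96K, 200K) = 32K and the next record
	 * starts block-aligned at off = 128K.
	 */
	len = MIN(blocksize - P2PHASE(off, blocksize), resid);
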
@@ -533,7 +528,7 @@
  */
 void
 zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
-	znode_t *zp, uint64_t off, uint64_t len)
+    znode_t *zp, uint64_t off, uint64_t len)
 {
 	itx_t *itx;
 	lr_truncate_t *lr;
@@ -556,7 +551,7 @@
  */
 void
 zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
-	znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
+    znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
 {
 	itx_t		*itx;
 	lr_setattr_t	*lr;

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -123,10 +123,11 @@
 {
 	file_t *fp, *tmpfp;
 	zfs_onexit_t *zo;
+	cap_rights_t rights;
 	void *data;
 	int error;
 
-	fp = getf(fd);
+	fp = getf(fd, cap_rights_init(&rights));
 	if (fp == NULL)
 		return (SET_ERROR(EBADF));
 
@@ -137,7 +138,7 @@
 		*minorp = (minor_t)(uintptr_t)data;
 	curthread->td_fpop = tmpfp;
 	if (error != 0)
-		return (error);
+		return (SET_ERROR(EBADF));
 	return (zfs_onexit_minor_to_state(*minorp, &zo));
 }
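
The FreeBSD getf() shim now takes a cap_rights_t, so Capsicum-sandboxed
callers are vetted; an empty rights set, as above, only requires a valid
descriptor.  A hedged sketch of requesting explicit rights instead
(CAP_IOCTL is an example choice, not what this code requires):

	cap_rights_t rights;

	fp = getf(fd, cap_rights_init(&rights, CAP_IOCTL));
	if (fp == NULL)
		return (SET_ERROR(EBADF));
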
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -55,7 +55,7 @@
 
 static void
 zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
-	uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
+    uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
 {
 	VATTR_NULL(vap);
 	vap->va_mask = (uint_t)mask;
@@ -821,7 +821,7 @@
 static int
 zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
 {
-#ifdef sun
+#ifdef illumos
 	znode_t *zp;
 	flock64_t fl;
 	int error;
@@ -844,10 +844,10 @@
 	VN_RELE(ZTOV(zp));
 
 	return (error);
-#else	/* !sun */
+#else
 	ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
 	return (EOPNOTSUPP);
-#endif	/* !sun */
+#endif
 }
 
 static int
@@ -906,11 +906,15 @@
 	return (error);
 }
 
+extern int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
+    caller_context_t *ct);
+
 static int
 zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap)
 {
 	ace_t *ace = (ace_t *)(lr + 1);	/* ace array follows lr_acl_t */
 	vsecattr_t vsa;
+	vnode_t *vp;
 	znode_t *zp;
 	int error;
 
@@ -929,13 +933,12 @@
 	vsa.vsa_aclflags = 0;
 	vsa.vsa_aclentp = ace;
 
-#ifdef TODO
-	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);
-#else
-	panic("%s:%u: unsupported condition", __func__, __LINE__);
-#endif
+	vp = ZTOV(zp);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL);
+	VOP_UNLOCK(vp, 0);
 
-	VN_RELE(ZTOV(zp));
+	VN_RELE(vp);
 
 	return (error);
 }
@@ -960,6 +963,7 @@
 	ace_t *ace = (ace_t *)(lr + 1);
 	vsecattr_t vsa;
 	znode_t *zp;
+	vnode_t *vp;
 	int error;
 
 	if (byteswap) {
@@ -975,7 +979,6 @@
 	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 		return (error);
 
-#ifdef TODO
 	bzero(&vsa, sizeof (vsa));
 	vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
 	vsa.vsa_aclcnt = lr->lr_aclcnt;
@@ -992,16 +995,16 @@
 		    lr->lr_fuidcnt, lr->lr_domcnt, 0, 0);
 	}
 
-	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);
+	vp = ZTOV(zp);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL);
+	VOP_UNLOCK(vp, 0);
 
 	if (zfsvfs->z_fuid_replay)
 		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
-#else
-	error = EOPNOTSUPP;
-#endif
 
 	zfsvfs->z_fuid_replay = NULL;
-	VN_RELE(ZTOV(zp));
+	VN_RELE(vp);
 
 	return (error);
 }

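The ACL replay functions above now call zfs_setsecattr() directly instead of
panicking, bracketing the call with an exclusive vnode lock.  A minimal sketch
of that bracket, assuming the Solaris-compat types used by the ZPL;
do_setacl() is a hypothetical wrapper, not a ZFS symbol:

static int
do_setacl(vnode_t *vp, vsecattr_t *vsap, cred_t *cr)
{
	int error;

	/* zfs_setsecattr() expects the vnode lock to be held. */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = zfs_setsecattr(vp, vsap, 0, cr, NULL);
	VOP_UNLOCK(vp, 0);

	VN_RELE(vp);		/* drop the hold taken by zfs_zget() */
	return (error);
}
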
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -23,8 +23,7 @@
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
-#include <sys/types.h>
-#include <sys/param.h>
+#include <sys/zfs_context.h>
 #include <sys/vnode.h>
 #include <sys/sa.h>
 #include <sys/zfs_acl.h>
@@ -126,7 +125,7 @@
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	xoptattr_t *xoap;
 
-	ASSERT(MUTEX_HELD(&zp->z_lock));
+	ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
 	VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
 	if (zp->z_is_sa) {
 		if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
@@ -160,7 +159,7 @@
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	xoptattr_t *xoap;
 
-	ASSERT(MUTEX_HELD(&zp->z_lock));
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 	VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
 	if (zp->z_is_sa)
 		VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
@@ -207,7 +206,6 @@
 	uint64_t crtime[2], mtime[2], ctime[2];
 	zfs_acl_phys_t znode_acl;
 	char scanstamp[AV_SCANSTAMP_SZ];
-	boolean_t drop_lock = B_FALSE;
 
 	/*
 	 * No upgrade if ACL isn't cached
@@ -219,8 +217,8 @@
 		return;
 
 	/*
-	 * If the z_lock is held and we aren't the owner
-	 * the just return since we don't want to deadlock
+	 * If the vnode lock is held and we aren't the owner
+	 * then just return since we don't want to deadlock
 	 * trying to update the status of z_is_sa.  This
 	 * file can then be upgraded at a later time.
 	 *
@@ -227,12 +225,8 @@
 	 * Otherwise, we know we are doing the
 	 * sa_update() that caused us to enter this function.
 	 */
-	if (mutex_owner(&zp->z_lock) != curthread) {
-		if (mutex_tryenter(&zp->z_lock) == 0)
+	if (vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_NOWAIT) != 0)
 			return;
-		else
-			drop_lock = B_TRUE;
-	}
 
 	/* First do a bulk query of the attributes that aren't cached */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
@@ -313,8 +307,7 @@
 
 	zp->z_is_sa = B_TRUE;
 done:
-	if (drop_lock)
-		mutex_exit(&zp->z_lock);
+	VOP_UNLOCK(ZTOV(zp), 0);
 }
 
 void

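The zfs_sa_upgrade() change above swaps the z_lock mutex games for a single
non-blocking vnode lock: if the lock is contended (for instance because the
sa_update() that triggered the upgrade already holds it), the upgrade is
simply deferred.  A minimal sketch of the pattern; try_upgrade() is a
hypothetical name, not the ZFS function:

static void
try_upgrade(znode_t *zp)
{
	vnode_t *vp = ZTOV(zp);

	/* LK_NOWAIT: fail immediately instead of sleeping on contention. */
	if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
		return;		/* the file can be upgraded later */

	/* ... perform the SA upgrade with the lock held ... */

	VOP_UNLOCK(vp, 0);
}
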
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -23,7 +23,8 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel at dawidek.net>.
  * All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -134,6 +135,13 @@
 	if (panicstr)
 		return (0);
 
+	/*
+	 * Ignore the system syncher.  ZFS already commits async data
+	 * at zfs_txg_timeout intervals.
+	 */
+	if (waitfor == MNT_LAZY)
+		return (0);
+
 	if (vfsp != NULL) {
 		/*
 		 * Sync a specific filesystem.
@@ -174,7 +182,7 @@
 	return (0);
 }
 
-#ifndef __FreeBSD__
+#ifndef __FreeBSD_kernel__
 static int
 zfs_create_unique_device(dev_t *dev)
 {
@@ -226,7 +234,7 @@
 
 	return (0);
 }
-#endif	/* !__FreeBSD__ */
+#endif	/* !__FreeBSD_kernel__ */
 
 static void
 atime_changed_cb(void *arg, uint64_t newval)
@@ -272,11 +280,10 @@
 blksz_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
+	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+	ASSERT(ISP2(newval));
 
-	if (newval < SPA_MINBLOCKSIZE ||
-	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
-		newval = SPA_MAXBLOCKSIZE;
-
 	zfsvfs->z_max_blksz = newval;
 	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
 }
@@ -468,6 +475,19 @@
 	}
 
 	/*
+	 * We need to enter pool configuration here, so that we can use
+	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
+	 * dsl_prop_get_integer() cannot be used, because it has to acquire
+	 * spa_namespace_lock and we cannot do that because we already hold
+	 * z_teardown_lock.  The problem is that spa_config_sync() is called
+	 * with spa_namespace_lock held and the function calls ZFS vnode
+	 * operations to write the cache file and thus z_teardown_lock is
+	 * acquired after spa_namespace_lock.
+	 */
+	ds = dmu_objset_ds(os);
+	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+
+	/*
 	 * nbmand is a special property.  It can only be changed at
 	 * mount time.
 	 *
@@ -478,14 +498,9 @@
 		nbmand = B_FALSE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
 		nbmand = B_TRUE;
-	} else {
-		char osname[MAXNAMELEN];
-
-		dmu_objset_name(os, osname);
-		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
-		    NULL)) {
-			return (error);
-		}
+	} else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) {
+		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+		return (error);
 	}
 
 	/*
@@ -495,8 +510,6 @@
 	 * the first prop_register(), but I guess I like to go
 	 * overboard...
 	 */
-	ds = dmu_objset_ds(os);
-	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 	error = dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
@@ -545,35 +558,7 @@
 	return (0);
 
 unregister:
-	/*
-	 * We may attempt to unregister some callbacks that are not
-	 * registered, but this is OK; it will simply return ENOMSG,
-	 * which we will ignore.
-	 */
-	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ATIME),
-	    atime_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_XATTR),
-	    xattr_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
-	    blksz_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_READONLY),
-	    readonly_changed_cb, zfsvfs);
-#ifdef illumos
-	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_DEVICES),
-	    devices_changed_cb, zfsvfs);
-#endif
-	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SETUID),
-	    setuid_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_EXEC),
-	    exec_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR),
-	    snapdir_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE),
-	    acl_mode_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT),
-	    acl_inherit_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_VSCAN),
-	    vscan_changed_cb, zfsvfs);
+	dsl_prop_unregister_all(ds, zfsvfs);
 	return (error);
 }
 
@@ -863,61 +848,46 @@
 	return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
 }
 
-int
-zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
+/*
+ * Associate this zfsvfs with the given objset, which must be owned.
+ * This will cache a bunch of on-disk state from the objset in the
+ * zfsvfs.
+ */
+static int
+zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
 {
-	objset_t *os;
-	zfsvfs_t *zfsvfs;
-	uint64_t zval;
-	int i, error;
-	uint64_t sa_obj;
+	int error;
+	uint64_t val;
 
-	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
-
-	/*
-	 * We claim to always be readonly so we can open snapshots;
-	 * other ZPL code will prevent us from writing to snapshots.
-	 */
-	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
-	if (error) {
-		kmem_free(zfsvfs, sizeof (zfsvfs_t));
-		return (error);
-	}
-
-	/*
-	 * Initialize the zfs-specific filesystem structure.
-	 * Should probably make this a kmem cache, shuffle fields,
-	 * and just bzero up to z_hold_mtx[].
-	 */
-	zfsvfs->z_vfs = NULL;
-	zfsvfs->z_parent = zfsvfs;
-	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
+	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 	zfsvfs->z_os = os;
 
 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
-	if (error) {
-		goto out;
-	} else if (zfsvfs->z_version >
+	if (error != 0)
+		return (error);
+	if (zfsvfs->z_version >
 	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
 		(void) printf("Can't mount a version %lld file system "
 		    "on a version %lld pool\n. Pool must be upgraded to mount "
 		    "this file system.", (u_longlong_t)zfsvfs->z_version,
 		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
-		error = SET_ERROR(ENOTSUP);
-		goto out;
+		return (SET_ERROR(ENOTSUP));
 	}
-	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
-		goto out;
-	zfsvfs->z_norm = (int)zval;
+	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
+	if (error != 0)
+		return (error);
+	zfsvfs->z_norm = (int)val;
 
-	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
-		goto out;
-	zfsvfs->z_utf8 = (zval != 0);
+	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
+	if (error != 0)
+		return (error);
+	zfsvfs->z_utf8 = (val != 0);
 
-	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
-		goto out;
-	zfsvfs->z_case = (uint_t)zval;
+	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
+	if (error != 0)
+		return (error);
+	zfsvfs->z_case = (uint_t)val;
 
 	/*
 	 * Fold case on file systems that are always or sometimes case
@@ -930,24 +900,19 @@
 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 
+	uint64_t sa_obj = 0;
 	if (zfsvfs->z_use_sa) {
 		/* should either have both of these objects or none */
 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
 		    &sa_obj);
-		if (error)
+		if (error != 0)
 			return (error);
-	} else {
-		/*
-		 * Pre SA versions file systems should never touch
-		 * either the attribute registration or layout objects.
-		 */
-		sa_obj = 0;
 	}
 
 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 	    &zfsvfs->z_attr_table);
-	if (error)
-		goto out;
+	if (error != 0)
+		return (error);
 
 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
 		sa_register_update_callback(os, zfs_sa_upgrade);
@@ -954,55 +919,117 @@
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 	    &zfsvfs->z_root);
-	if (error)
-		goto out;
+	if (error != 0)
+		return (error);
 	ASSERT(zfsvfs->z_root != 0);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 	    &zfsvfs->z_unlinkedobj);
-	if (error)
-		goto out;
+	if (error != 0)
+		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ,
 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
 	    8, 1, &zfsvfs->z_userquota_obj);
-	if (error && error != ENOENT)
-		goto out;
+	if (error == ENOENT)
+		zfsvfs->z_userquota_obj = 0;
+	else if (error != 0)
+		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ,
 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
 	    8, 1, &zfsvfs->z_groupquota_obj);
-	if (error && error != ENOENT)
-		goto out;
+	if (error == ENOENT)
+		zfsvfs->z_groupquota_obj = 0;
+	else if (error != 0)
+		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
 	    &zfsvfs->z_fuid_obj);
-	if (error && error != ENOENT)
-		goto out;
+	if (error == ENOENT)
+		zfsvfs->z_fuid_obj = 0;
+	else if (error != 0)
+		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
 	    &zfsvfs->z_shares_dir);
-	if (error && error != ENOENT)
-		goto out;
+	if (error == ENOENT)
+		zfsvfs->z_shares_dir = 0;
+	else if (error != 0)
+		return (error);
 
+	/*
+	 * Only use the name cache if we are looking for a
+	 * name on a file system that does not require normalization
+	 * or case folding.  We can also look there if we happen to be
+	 * on a non-normalizing, mixed sensitivity file system IF we
+	 * are looking for the exact name (which is always the case on
+	 * FreeBSD).
+	 */
+	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
+	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
+	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
+
+	return (0);
+}
+
+int
+zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
+{
+	objset_t *os;
+	zfsvfs_t *zfsvfs;
+	int error;
+
+	/*
+	 * XXX: Fix struct statfs so this isn't necessary!
+	 *
+	 * The 'osname' is used as the filesystem's special node, which means
+	 * it must fit in statfs.f_mntfromname, or else it can't be
+	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
+	 * 'zfs unmount' to think it's not mounted when it is.
+	 */
+	if (strlen(osname) >= MNAMELEN)
+		return (SET_ERROR(ENAMETOOLONG));
+
+	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+	/*
+	 * We claim to always be readonly so we can open snapshots;
+	 * other ZPL code will prevent us from writing to snapshots.
+	 */
+	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
+	if (error) {
+		kmem_free(zfsvfs, sizeof (zfsvfs_t));
+		return (error);
+	}
+
+	zfsvfs->z_vfs = NULL;
+	zfsvfs->z_parent = zfsvfs;
+
 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
-	rrw_init(&zfsvfs->z_teardown_lock, B_FALSE);
+#ifdef DIAGNOSTIC
+	rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
+#else
+	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
+#endif
 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
-	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
+	error = zfsvfs_init(zfsvfs, os);
+	if (error != 0) {
+		dmu_objset_disown(os, zfsvfs);
+		*zfvp = NULL;
+		kmem_free(zfsvfs, sizeof (zfsvfs_t));
+		return (error);
+	}
+
 	*zfvp = zfsvfs;
 	return (0);
-
-out:
-	dmu_objset_disown(os, zfsvfs);
-	*zfvp = NULL;
-	kmem_free(zfsvfs, sizeof (zfsvfs_t));
-	return (error);
 }
 
 static int
@@ -1014,13 +1041,6 @@
 	if (error)
 		return (error);
 
-	/*
-	 * Set the objset user_ptr to track its zfsvfs.
-	 */
-	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
-	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
-	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
-
 	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
 
 	/*
@@ -1081,6 +1101,13 @@
 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
 	}
 
+	/*
+	 * Set the objset user_ptr to track its zfsvfs.
+	 */
+	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+
 	return (0);
 }
 
@@ -1105,7 +1132,7 @@
 	mutex_destroy(&zfsvfs->z_znodes_lock);
 	mutex_destroy(&zfsvfs->z_lock);
 	list_destroy(&zfsvfs->z_all_znodes);
-	rrw_destroy(&zfsvfs->z_teardown_lock);
+	rrm_destroy(&zfsvfs->z_teardown_lock);
 	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
 	rw_destroy(&zfsvfs->z_fuid_lock);
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -1173,10 +1200,10 @@
 
 	vfsp->vfs_data = zfsvfs;
 	vfsp->mnt_flag |= MNT_LOCAL;
-	vfsp->mnt_kern_flag |= MNTK_MPSAFE;
 	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
 	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
 	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
+	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
 
 	/*
 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
@@ -1225,9 +1252,6 @@
 	}
 
 	vfs_mountedfrom(vfsp, osname);
-	/* Grab extra reference. */
-	VERIFY(VFS_ROOT(vfsp, LK_EXCLUSIVE, &vp) == 0);
-	VOP_UNLOCK(vp, 0);
 
 	if (!zfsvfs->z_issnap)
 		zfsctl_create(zfsvfs);
@@ -1236,7 +1260,7 @@
 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 		zfsvfs_free(zfsvfs);
 	} else {
-		atomic_add_32(&zfs_active_fs_count, 1);
+		atomic_inc_32(&zfs_active_fs_count);
 	}
 
 	return (error);
@@ -1246,43 +1270,9 @@
 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
 {
 	objset_t *os = zfsvfs->z_os;
-	struct dsl_dataset *ds;
 
-	/*
-	 * Unregister properties.
-	 */
-	if (!dmu_objset_is_snapshot(os)) {
-		ds = dmu_objset_ds(os);
-		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "aclinherit",
-		    acl_inherit_changed_cb, zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "vscan",
-		    vscan_changed_cb, zfsvfs) == 0);
-	}
+	if (!dmu_objset_is_snapshot(os))
+		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
 }
 
 #ifdef SECLABEL
@@ -1628,13 +1618,21 @@
 	 * can be interrogated.
 	 */
 	if ((uap->flags & MS_DATA) && uap->datalen > 0)
-#else
+		return (SET_ERROR(EINVAL));
+
+	/*
+	 * Get the objset name (the "special" mount argument).
+	 */
+	if (error = pn_get(uap->spec, fromspace, &spn))
+		return (error);
+
+	osname = spn.pn_path;
+#else	/* !illumos */
 	if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS))
 		return (SET_ERROR(EPERM));
 
 	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
 		return (SET_ERROR(EINVAL));
-#endif	/* ! illumos */
 
 	/*
 	 * If full-owner-access is enabled and delegated administration is
@@ -1644,6 +1642,7 @@
 	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
 		secpolicy_fs_mount_clearopts(cr, vfsp);
 	}
+#endif	/* illumos */
 
 	/*
 	 * Check for mount privilege?
@@ -1706,9 +1705,19 @@
 	 * according to those options set in the current VFS options.
 	 */
 	if (vfsp->vfs_flag & MS_REMOUNT) {
-		/* refresh mount options */
-		zfs_unregister_callbacks(vfsp->vfs_data);
+		zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+		/*
+		 * Refresh mount options with z_teardown_lock blocking I/O while
+		 * the filesystem is in an inconsistent state.
+		 * The lock also serializes this code with filesystem
+		 * manipulations between entry to zfs_suspend_fs() and return
+		 * from zfs_resume_fs().
+		 */
+		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+		zfs_unregister_callbacks(zfsvfs);
 		error = zfs_register_callbacks(vfsp);
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 		goto out;
 	}
 
@@ -1727,7 +1736,7 @@
 	error = zfs_domount(vfsp, osname);
 	PICKUP_GIANT();
 
-#ifdef sun
+#ifdef illumos
 	/*
 	 * Add an extra VFS_HOLD on our parent vfs so that it can't
 	 * disappear due to a forced unmount.
@@ -1734,7 +1743,7 @@
 	 */
 	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
 		VFS_HOLD(mvp->v_vfsp);
-#endif	/* sun */
+#endif
 
 out:
 	return (error);
@@ -1792,23 +1801,12 @@
 	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
 	    sizeof(statp->f_mntonname));
 
-	statp->f_namemax = ZFS_MAXNAMELEN;
+	statp->f_namemax = MAXNAMELEN - 1;
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
-int
-zfs_vnode_lock(vnode_t *vp, int flags)
-{
-	int error;
-
-	ASSERT(vp != NULL);
-
-	error = vn_lock(vp, flags);
-	return (error);
-}
-
 static int
 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
 {
@@ -1816,7 +1814,7 @@
 	znode_t *rootzp;
 	int error;
 
-	ZFS_ENTER_NOERROR(zfsvfs);
+	ZFS_ENTER(zfsvfs);
 
 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
 	if (error == 0)
@@ -1825,13 +1823,12 @@
 	ZFS_EXIT(zfsvfs);
 
 	if (error == 0) {
-		error = zfs_vnode_lock(*vpp, flags);
-		if (error == 0)
-			(*vpp)->v_vflag |= VV_ROOT;
+		error = vn_lock(*vpp, flags);
+		if (error != 0) {
+			VN_RELE(*vpp);
+			*vpp = NULL;
+		}
 	}
-	if (error != 0)
-		*vpp = NULL;
-
 	return (error);
 }
 
@@ -1846,7 +1843,7 @@
 {
 	znode_t	*zp;
 
-	rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 
 	if (!unmounting) {
 		/*
@@ -1879,7 +1876,7 @@
 	 */
 	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
-		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 		return (SET_ERROR(EIO));
 	}
 
@@ -1906,7 +1903,7 @@
 	 */
 	if (unmounting) {
 		zfsvfs->z_unmounted = B_TRUE;
-		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 	}
 
@@ -1966,15 +1963,6 @@
 	if (zfsvfs->z_ctldir != NULL) {
 		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
 			return (ret);
-		ret = vflush(vfsp, 0, 0, td);
-		ASSERT(ret == EBUSY);
-		if (!(fflag & MS_FORCE)) {
-			if (zfsvfs->z_ctldir->v_count > 1)
-				return (EBUSY);
-			ASSERT(zfsvfs->z_ctldir->v_count == 1);
-		}
-		zfsctl_destroy(zfsvfs);
-		ASSERT(zfsvfs->z_ctldir == NULL);
 	}
 
 	if (fflag & MS_FORCE) {
@@ -1983,23 +1971,19 @@
 		 * vflush(FORCECLOSE). This way we ensure no future vnops
 		 * will be called and risk operating on DOOMED vnodes.
 		 */
-		rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 		zfsvfs->z_unmounted = B_TRUE;
-		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 	}
 
 	/*
 	 * Flush all the files.
 	 */
-	ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
-	if (ret != 0) {
-		if (!zfsvfs->z_issnap) {
-			zfsctl_create(zfsvfs);
-			ASSERT(zfsvfs->z_ctldir != NULL);
-		}
+	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
+	if (ret != 0)
 		return (ret);
-	}
 
+#ifdef illumos
 	if (!(fflag & MS_FORCE)) {
 		/*
 		 * Check the number of active vnodes in the file system.
@@ -2020,6 +2004,7 @@
 				return (SET_ERROR(EBUSY));
 		}
 	}
+#endif
 
 	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
 	os = zfsvfs->z_os;
@@ -2047,12 +2032,6 @@
 	 */
 	if (zfsvfs->z_ctldir != NULL)
 		zfsctl_destroy(zfsvfs);
-	if (zfsvfs->z_issnap) {
-		vnode_t *svp = vfsp->mnt_vnodecovered;
-
-		if (svp->v_count >= 2)
-			VN_RELE(svp);
-	}
 	zfs_freevfs(vfsp);
 
 	return (0);
@@ -2077,7 +2056,7 @@
 	ZFS_ENTER(zfsvfs);
 	err = zfs_zget(zfsvfs, ino, &zp);
 	if (err == 0 && zp->z_unlinked) {
-		VN_RELE(ZTOV(zp));
+		vrele(ZTOV(zp));
 		err = EINVAL;
 	}
 	if (err == 0)
@@ -2084,11 +2063,9 @@
 		*vpp = ZTOV(zp);
 	ZFS_EXIT(zfsvfs);
 	if (err == 0)
-		err = zfs_vnode_lock(*vpp, flags);
+		err = vn_lock(*vpp, flags);
 	if (err != 0)
 		*vpp = NULL;
-	else
-		(*vpp)->v_hash = ino;
 	return (err);
 }
 
@@ -2115,8 +2092,10 @@
 static int
 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
 {
+	struct componentname cn;
 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
 	znode_t		*zp;
+	vnode_t		*dvp;
 	uint64_t	object = 0;
 	uint64_t	fid_gen = 0;
 	uint64_t	gen_mask;
@@ -2171,21 +2150,32 @@
 	if ((fid_gen == 0 &&
 	     (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
 	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
-		*vpp = zfsvfs->z_ctldir;
-		ASSERT(*vpp != NULL);
+		ZFS_EXIT(zfsvfs);
+		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
 		if (object == ZFSCTL_INO_SNAPDIR) {
-			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
-			    0, NULL, NULL, NULL, NULL, NULL) == 0);
+			cn.cn_nameptr = "snapshot";
+			cn.cn_namelen = strlen(cn.cn_nameptr);
+			cn.cn_nameiop = LOOKUP;
+			cn.cn_flags = ISLASTCN | LOCKLEAF;
+			cn.cn_lkflags = flags;
+			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+			vput(dvp);
 		} else if (object == zfsvfs->z_shares_dir) {
-			VERIFY(zfsctl_root_lookup(*vpp, "shares", vpp, NULL,
-			    0, NULL, NULL, NULL, NULL, NULL) == 0);
+			/*
+			 * XXX This branch must not be taken;
+			 * if it is, the lookup below will
+			 * explode.
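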
+			 */
+			cn.cn_nameptr = "shares";
+			cn.cn_namelen = strlen(cn.cn_nameptr);
+			cn.cn_nameiop = LOOKUP;
+			cn.cn_flags = ISLASTCN;
+			cn.cn_lkflags = flags;
+			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+			vput(dvp);
 		} else {
-			VN_HOLD(*vpp);
+			*vpp = dvp;
 		}
-		ZFS_EXIT(zfsvfs);
-		err = zfs_vnode_lock(*vpp, flags);
-		if (err != 0)
-			*vpp = NULL;
 		return (err);
 	}
 
@@ -2203,7 +2193,7 @@
 		zp_gen = 1;
 	if (zp->z_unlinked || zp_gen != fid_gen) {
 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
-		VN_RELE(ZTOV(zp));
+		vrele(ZTOV(zp));
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
@@ -2210,7 +2200,7 @@
 
 	*vpp = ZTOV(zp);
 	ZFS_EXIT(zfsvfs);
-	err = zfs_vnode_lock(*vpp, flags | LK_RETRY);
+	err = vn_lock(*vpp, flags);
 	if (err == 0)
 		vnode_create_vobject(*vpp, zp->z_size, curthread);
 	else
@@ -2222,7 +2212,9 @@
  * Block out VOPs and close zfsvfs_t::z_os
  *
  * Note, if successful, then we return with the 'z_teardown_lock' and
- * 'z_teardown_inactive_lock' write held.
+ * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
+ * dataset and objset intact so that they can be atomically handed off during
+ * a subsequent rollback or recv operation and the resume thereafter.
  */
 int
 zfs_suspend_fs(zfsvfs_t *zfsvfs)
@@ -2231,84 +2223,70 @@
 
 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
 		return (error);
-	dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 
 	return (0);
 }
 
 /*
- * Reopen zfsvfs_t::z_os and release VOPs.
+ * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
+ * is an invariant across any of the operations that can be performed while the
+ * filesystem was suspended.  Whether it succeeded or failed, the preconditions
+ * are the same: the relevant objset and associated dataset are owned by
+ * zfsvfs, held, and long held on entry.
  */
 int
-zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
+zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
 {
 	int err;
+	znode_t *zp;
 
-	ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
+	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
 	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
 
-	err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs,
-	    &zfsvfs->z_os);
-	if (err) {
-		zfsvfs->z_os = NULL;
-	} else {
-		znode_t *zp;
-		uint64_t sa_obj = 0;
+	/*
+	 * We already own this, so just update the objset_t, as the one we
+	 * had before may have been evicted.
+	 */
+	objset_t *os;
+	VERIFY3P(ds->ds_owner, ==, zfsvfs);
+	VERIFY(dsl_dataset_long_held(ds));
+	VERIFY0(dmu_objset_from_ds(ds, &os));
 
-		/*
-		 * Make sure version hasn't changed
-		 */
+	err = zfsvfs_init(zfsvfs, os);
+	if (err != 0)
+		goto bail;
 
-		err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION,
-		    &zfsvfs->z_version);
+	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
 
-		if (err)
-			goto bail;
+	zfs_set_fuid_feature(zfsvfs);
 
-		err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
-		    ZFS_SA_ATTRS, 8, 1, &sa_obj);
-
-		if (err && zfsvfs->z_version >= ZPL_VERSION_SA)
-			goto bail;
-
-		if ((err = sa_setup(zfsvfs->z_os, sa_obj,
-		    zfs_attr_table,  ZPL_END, &zfsvfs->z_attr_table)) != 0)
-			goto bail;
-
-		if (zfsvfs->z_version >= ZPL_VERSION_SA)
-			sa_register_update_callback(zfsvfs->z_os,
-			    zfs_sa_upgrade);
-
-		VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
-
-		zfs_set_fuid_feature(zfsvfs);
-
-		/*
-		 * Attempt to re-establish all the active znodes with
-		 * their dbufs.  If a zfs_rezget() fails, then we'll let
-		 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
-		 * when they try to use their znode.
-		 */
-		mutex_enter(&zfsvfs->z_znodes_lock);
-		for (zp = list_head(&zfsvfs->z_all_znodes); zp;
-		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
-			(void) zfs_rezget(zp);
-		}
-		mutex_exit(&zfsvfs->z_znodes_lock);
+	/*
+	 * Attempt to re-establish all the active znodes with
+	 * their dbufs.  If a zfs_rezget() fails, then we'll let
+	 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
+	 * when they try to use their znode.
+	 */
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+		(void) zfs_rezget(zp);
 	}
+	mutex_exit(&zfsvfs->z_znodes_lock);
 
 bail:
 	/* release the VOPs */
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
-	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 
 	if (err) {
 		/*
-		 * Since we couldn't reopen zfsvfs::z_os, or
-		 * setup the sa framework force unmount this file system.
+		 * Since we couldn't setup the sa framework, try to force
+		 * unmount this file system.
 		 */
-		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
+		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
+			vfs_ref(zfsvfs->z_vfs);
 			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
+		}
 	}
 	return (err);
 }
@@ -2318,7 +2296,7 @@
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 
-#ifdef sun
+#ifdef illumos
 	/*
 	 * If this is a snapshot, we have an extra VFS_HOLD on our parent
 	 * from zfs_mount().  Release it here.  If we came through
@@ -2327,11 +2305,11 @@
 	 */
 	if (zfsvfs->z_issnap && (vfsp != rootvfs))
 		VFS_RELE(zfsvfs->z_parent->z_vfs);
-#endif	/* sun */
+#endif
 
 	zfsvfs_free(zfsvfs);
 
-	atomic_add_32(&zfs_active_fs_count, -1);
+	atomic_dec_32(&zfs_active_fs_count);
 }
 
 #ifdef __i386__

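Among the zfs_vfsops.c changes above, zfsvfs_init() adopts a uniform
convention for optional master-node entries: ENOENT means the object simply
is not in use and is recorded as 0, while any other error aborts the setup.
A minimal sketch of that convention, assuming the usual ZFS headers;
lookup_optional() is a hypothetical helper, not a ZFS symbol:

static int
lookup_optional(objset_t *os, const char *name, uint64_t *objp)
{
	int error;

	error = zap_lookup(os, MASTER_NODE_OBJ, name, 8, 1, objp);
	if (error == ENOENT) {
		*objp = 0;	/* feature not in use on this dataset */
		return (0);
	}
	return (error);		/* 0 on success, real errors propagate */
}
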
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,9 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
@@ -34,6 +36,7 @@
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <sys/vfs.h>
+#include <sys/vm.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/stat.h>
@@ -64,16 +67,14 @@
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/zfs_sa.h>
-#include <sys/dnlc.h>
 #include <sys/zfs_rlock.h>
 #include <sys/extdirent.h>
 #include <sys/kidmap.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
-#include <sys/sf_buf.h>
 #include <sys/sched.h>
 #include <sys/acl.h>
-#include <vm/vm_pageout.h>
+#include <vm/vm_param.h>
 
 /*
  * Programming rules.
@@ -105,12 +106,19 @@
  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
  *
- *  (4)	Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
- *	This is critical because we don't want to block while holding locks.
- *	Note, in particular, that if a lock is sometimes acquired before
- *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
- *	use a non-blocking assign can deadlock the system.  The scenario:
+ *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
+ *      dmu_tx_assign().  This is critical because we don't want to block
+ *      while holding locks.
  *
+ *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
+ *	reduces lock contention and CPU usage when we must wait (note that if
+ *	throughput is constrained by the storage, nearly every transaction
+ *	must wait).
+ *
+ *      Note, in particular, that if a lock is sometimes acquired before
+ *      the tx assigns, and sometimes after (e.g. z_lock), then failing
+ *      to use a non-blocking assign can deadlock the system.  The scenario:
+ *
  *	Thread A has grabbed a lock before calling dmu_tx_assign().
  *	Thread B is in an already-assigned tx, and blocks for this lock.
  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
@@ -117,7 +125,11 @@
  *	forever, because the previous txg can't quiesce until B's tx commits.
  *
  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
- *	then drop all locks, call dmu_tx_wait(), and try again.
+ *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
+ *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
+ *	to indicate that this operation has already called dmu_tx_wait().
+ *	This will ensure that we don't retry forever, waiting a short bit
+ *	each time.
  *
  *  (5)	If the operation succeeded, generate the intent log entry for it
  *	before dropping locks.  This ensures that the ordering of events
@@ -135,16 +147,17 @@
  *
  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
  * top:
- *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
+ *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
- *	error = dmu_tx_assign(tx, TXG_NOWAIT);	// try to assign
+ *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
  *	if (error) {
  *		rw_exit(...);		// drop locks
  *		zfs_dirent_unlock(dl);	// unlock directory entry
  *		VN_RELE(...);		// release held vnodes
  *		if (error == ERESTART) {
+ *			waited = B_TRUE;
  *			dmu_tx_wait(tx);
  *			dmu_tx_abort(tx);
  *			goto top;
@@ -253,16 +266,19 @@
 
 	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
 
-	/* end of file? */
-	if ((error == ESRCH) || (noff > file_sz)) {
-		/*
-		 * Handle the virtual hole at the end of file.
-		 */
-		if (hole) {
-			*off = file_sz;
-			return (0);
-		}
+	if (error == ESRCH)
 		return (SET_ERROR(ENXIO));
+
+	/*
+	 * We could find a hole that begins after the logical end-of-file,
+	 * because dmu_offset_next() only works on whole blocks.  If the
+	 * EOF falls mid-block, then indicate that the "virtual hole"
+	 * at the end of the file begins at the logical EOF, rather than
+	 * at the end of the last block.
+	 */
+	if (noff > file_sz) {
+		ASSERT(hole);
+		noff = file_sz;
 	}
 
 	if (noff < *off)
@@ -277,6 +293,8 @@
     int *rvalp, caller_context_t *ct)
 {
 	offset_t off;
+	offset_t ndata;
+	dmu_object_info_t doi;
 	int error;
 	zfsvfs_t *zfsvfs;
 	znode_t *zp;
@@ -283,6 +301,7 @@
 
 	switch (com) {
 	case _FIOFFS:
+	{
 		return (0);
 
 		/*
@@ -289,13 +308,17 @@
 		 * The following two ioctls are used by bfu.  Faking out,
 		 * necessary to avoid bfu errors.
 		 */
+	}
 	case _FIOGDIO:
 	case _FIOSDIO:
+	{
 		return (0);
+	}
 
 	case _FIO_SEEK_DATA:
 	case _FIO_SEEK_HOLE:
-#ifdef sun
+	{
+#ifdef illumos
 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
 			return (SET_ERROR(EFAULT));
 #else
@@ -311,7 +334,7 @@
 		ZFS_EXIT(zfsvfs);
 		if (error)
 			return (error);
-#ifdef sun
+#ifdef illumos
 		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
 			return (SET_ERROR(EFAULT));
 #else
@@ -319,6 +342,48 @@
 #endif
 		return (0);
 	}
+#ifdef illumos
+	case _FIO_COUNT_FILLED:
+	{
+		/*
+		 * _FIO_COUNT_FILLED adds a new ioctl command which
+		 * exposes the number of filled blocks in a
+		 * ZFS object.
+		 */
+		zp = VTOZ(vp);
+		zfsvfs = zp->z_zfsvfs;
+		ZFS_ENTER(zfsvfs);
+		ZFS_VERIFY_ZP(zp);
+
+		/*
+		 * Wait for all dirty blocks for this object
+		 * to get synced out to disk, and the DMU info
+		 * updated.
+		 */
+		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
+		if (error) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+
+		/*
+		 * Retrieve fill count from DMU object.
+		 */
+		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
+		if (error) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+
+		ndata = doi.doi_fill_count;
+
+		ZFS_EXIT(zfsvfs);
+		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
+			return (SET_ERROR(EFAULT));
+		return (0);
+	}
+#endif
+	}
 	return (SET_ERROR(ENOTTY));
 }
 
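The zfs_holey() change earlier in this file clamps a hole that starts past
the logical EOF back to the EOF itself, which is what a SEEK_HOLE caller
observes as the "virtual hole" at end of file.  A small userland sketch of
that observable behaviour, assuming a platform where lseek() supports
SEEK_HOLE:

#include <sys/types.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	off_t hole;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) == -1)
		return (1);
	hole = lseek(fd, 0, SEEK_HOLE);
	/* On a file with no holes this prints the file size. */
	printf("first hole at %jd\n", (intmax_t)hole);
	close(fd);
	return (0);
}
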
@@ -327,14 +392,28 @@
 {
 	vm_object_t obj;
 	vm_page_t pp;
+	int64_t end;
 
+	/*
+	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
+	 * aligned boundaries, if the range is not aligned.  As a result a
+	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
+	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
+	 * the whole page would be considered clean despite having some dirty data.
+	 * For this reason we should shrink the range to DEV_BSIZE aligned
+	 * boundaries before calling vm_page_clear_dirty.
+	 */
+	end = rounddown2(off + nbytes, DEV_BSIZE);
+	off = roundup2(off, DEV_BSIZE);
+	nbytes = end - off;
+
 	obj = vp->v_object;
-	VM_OBJECT_LOCK_ASSERT(obj, MA_OWNED);
+	zfs_vmobject_assert_wlocked(obj);
 
 	for (;;) {
 		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 		    pp->valid) {
-			if ((pp->oflags & VPO_BUSY) != 0) {
+			if (vm_page_xbusied(pp)) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
@@ -341,13 +420,17 @@
 				 * likely to reclaim it.
 				 */
 				vm_page_reference(pp);
-				vm_page_sleep(pp, "zfsmwb");
+				vm_page_lock(pp);
+				zfs_vmobject_wunlock(obj);
+				vm_page_busy_sleep(pp, "zfsmwb", true);
+				zfs_vmobject_wlock(obj);
 				continue;
 			}
+			vm_page_sbusy(pp);
 		} else if (pp == NULL) {
 			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
 			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
-			    VM_ALLOC_NOBUSY);
+			    VM_ALLOC_SBUSY);
 		} else {
 			ASSERT(pp != NULL && !pp->valid);
 			pp = NULL;
@@ -356,9 +439,9 @@
 		if (pp != NULL) {
 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 			vm_object_pip_add(obj, 1);
-			vm_page_io_start(pp);
 			pmap_remove_write(pp);
-			vm_page_clear_dirty(pp, off, nbytes);
+			if (nbytes != 0)
+				vm_page_clear_dirty(pp, off, nbytes);
 		}
 		break;
 	}
@@ -369,7 +452,7 @@
 page_unbusy(vm_page_t pp)
 {
 
-	vm_page_io_finish(pp);
+	vm_page_sunbusy(pp);
 	vm_object_pip_subtract(pp->object, 1);
 }
 
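The page_busy() hunk above shrinks the byte range inward to DEV_BSIZE
boundaries before vm_page_clear_dirty(), so partially covered 512-byte
subranges keep their dirty bits.  A standalone sketch of just that
arithmetic, with made-up values and local macro definitions (the kernel
takes rounddown2()/roundup2() from sys/param.h):

#include <stdio.h>
#include <stdint.h>

#define DEV_BSIZE	512
#define rounddown2(x, y)	((x) & ~((y) - 1))	/* y: power of 2 */
#define roundup2(x, y)		(((x) + ((y) - 1)) & ~((y) - 1))

int
main(void)
{
	int64_t off = 100, nbytes = 1000;	/* unaligned range in a page */
	int64_t end = rounddown2(off + nbytes, DEV_BSIZE);

	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;		/* may shrink to 0 for tiny ranges */
	printf("clear dirty bits in [%jd, %jd)\n",
	    (intmax_t)off, (intmax_t)(off + nbytes));
	return (0);
}
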
@@ -380,12 +463,12 @@
 	vm_page_t pp;
 
 	obj = vp->v_object;
-	VM_OBJECT_LOCK_ASSERT(obj, MA_OWNED);
+	zfs_vmobject_assert_wlocked(obj);
 
 	for (;;) {
 		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 		    pp->valid) {
-			if ((pp->oflags & VPO_BUSY) != 0) {
+			if (vm_page_xbusied(pp)) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
@@ -392,7 +475,10 @@
 				 * likely to reclaim it.
 				 */
 				vm_page_reference(pp);
-				vm_page_sleep(pp, "zfsmwb");
+				vm_page_lock(pp);
+				zfs_vmobject_wunlock(obj);
+				vm_page_busy_sleep(pp, "zfsmwb", true);
+				zfs_vmobject_wlock(obj);
 				continue;
 			}
 
@@ -417,21 +503,6 @@
 	vm_page_unlock(pp);
 }
 
-static caddr_t
-zfs_map_page(vm_page_t pp, struct sf_buf **sfp)
-{
-
-	*sfp = sf_buf_alloc(pp, 0);
-	return ((caddr_t)sf_buf_kva(*sfp));
-}
-
-static void
-zfs_unmap_page(struct sf_buf *sf)
-{
-
-	sf_buf_free(sf);
-}
-
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
@@ -448,53 +519,33 @@
 	caddr_t va;
 	int off;
 
+	ASSERT(segflg != UIO_NOCOPY);
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 
 	off = start & PAGEOFFSET;
-	VM_OBJECT_LOCK(obj);
+	zfs_vmobject_wlock(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		vm_page_t pp;
 		int nbytes = imin(PAGESIZE - off, len);
 
-		if (segflg == UIO_NOCOPY) {
-			pp = vm_page_lookup(obj, OFF_TO_IDX(start));
-			KASSERT(pp != NULL,
-			    ("zfs update_pages: NULL page in putpages case"));
-			KASSERT(off == 0,
-			    ("zfs update_pages: unaligned data in putpages case"));
-			KASSERT(pp->valid == VM_PAGE_BITS_ALL,
-			    ("zfs update_pages: invalid page in putpages case"));
-			KASSERT(pp->busy > 0,
-			    ("zfs update_pages: unbusy page in putpages case"));
-			KASSERT(!pmap_page_is_write_mapped(pp),
-			    ("zfs update_pages: writable page in putpages case"));
-			VM_OBJECT_UNLOCK(obj);
+		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
+			zfs_vmobject_wunlock(obj);
 
 			va = zfs_map_page(pp, &sf);
-			(void) dmu_write(os, oid, start, nbytes, va, tx);
-			zfs_unmap_page(sf);
-
-			VM_OBJECT_LOCK(obj);
-			vm_page_undirty(pp);
-		} else if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
-			VM_OBJECT_UNLOCK(obj);
-
-			va = zfs_map_page(pp, &sf);
 			(void) dmu_read(os, oid, start+off, nbytes,
 			    va+off, DMU_READ_PREFETCH);
 			zfs_unmap_page(sf);
 
-			VM_OBJECT_LOCK(obj);
+			zfs_vmobject_wlock(obj);
 			page_unbusy(pp);
 		}
 		len -= nbytes;
 		off = 0;
 	}
-	if (segflg != UIO_NOCOPY)
-		vm_object_pip_wakeupn(obj, 0);
-	VM_OBJECT_UNLOCK(obj);
+	vm_object_pip_wakeupn(obj, 0);
+	zfs_vmobject_wunlock(obj);
 }
 
 /*
@@ -502,7 +553,7 @@
  * ZFS to populate a range of page cache pages with data.
  *
  * NOTE: this function could be optimized to pre-allocate
- * all pages in advance, drain VPO_BUSY on all of them,
+ * all pages in advance, drain exclusive busy on all of them,
  * map them into contiguous KVA region and populate them
  * in one single dmu_read() call.
  */
@@ -526,15 +577,14 @@
 	ASSERT(obj != NULL);
 	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
 
-	VM_OBJECT_LOCK(obj);
+	zfs_vmobject_wlock(obj);
 	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
 		int bytes = MIN(PAGESIZE, len);
 
-		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_NOBUSY |
-		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY);
+		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
+		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
 		if (pp->valid == 0) {
-			vm_page_io_start(pp);
-			VM_OBJECT_UNLOCK(obj);
+			zfs_vmobject_wunlock(obj);
 			va = zfs_map_page(pp, &sf);
 			error = dmu_read(os, zp->z_id, start, bytes, va,
 			    DMU_READ_PREFETCH);
@@ -541,16 +591,21 @@
 			if (bytes != PAGESIZE && error == 0)
 				bzero(va + bytes, PAGESIZE - bytes);
 			zfs_unmap_page(sf);
-			VM_OBJECT_LOCK(obj);
-			vm_page_io_finish(pp);
+			zfs_vmobject_wlock(obj);
+			vm_page_sunbusy(pp);
 			vm_page_lock(pp);
 			if (error) {
-				vm_page_free(pp);
+				if (pp->wire_count == 0 && pp->valid == 0 &&
+				    !vm_page_busied(pp))
+					vm_page_free(pp);
 			} else {
 				pp->valid = VM_PAGE_BITS_ALL;
 				vm_page_activate(pp);
 			}
 			vm_page_unlock(pp);
+		} else {
+			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+			vm_page_sunbusy(pp);
 		}
 		if (error)
 			break;
@@ -558,7 +613,7 @@
 		uio->uio_offset += bytes;
 		len -= bytes;
 	}
-	VM_OBJECT_UNLOCK(obj);
+	zfs_vmobject_wunlock(obj);
 	return (error);
 }
 
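The hunks above convert the busy-page handling to the newer FreeBSD
busy-state API: vm_page_xbusied()/vm_page_sbusy() replace the old VPO_BUSY
flag and vm_page_io_start()/vm_page_io_finish().  A hedged sketch of the
wait loop used when an exclusively busied page is found; wait_for_page() is
a hypothetical name, and the zfs_vmobject_*() wrappers are the ones
introduced in this sync:

static vm_page_t
wait_for_page(vm_object_t obj, vm_pindex_t idx)
{
	vm_page_t pp;

	for (;;) {
		pp = vm_page_lookup(obj, idx);
		if (pp == NULL || !vm_page_xbusied(pp))
			return (pp);
		/* Reference the page so the page daemon leaves it alone. */
		vm_page_reference(pp);
		vm_page_lock(pp);
		zfs_vmobject_wunlock(obj);	/* drop before sleeping */
		vm_page_busy_sleep(pp, "zfsmwb", true);
		zfs_vmobject_wlock(obj);
	}
}
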
@@ -576,7 +631,6 @@
 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 {
 	znode_t *zp = VTOZ(vp);
-	objset_t *os = zp->z_zfsvfs->z_os;
 	vm_object_t obj;
 	int64_t start;
 	caddr_t va;
@@ -590,7 +644,7 @@
 
 	start = uio->uio_loffset;
 	off = start & PAGEOFFSET;
-	VM_OBJECT_LOCK(obj);
+	zfs_vmobject_wlock(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		vm_page_t pp;
 		uint64_t bytes = MIN(PAGESIZE - off, len);
@@ -599,16 +653,21 @@
 			struct sf_buf *sf;
 			caddr_t va;
 
-			VM_OBJECT_UNLOCK(obj);
+			zfs_vmobject_wunlock(obj);
 			va = zfs_map_page(pp, &sf);
+#ifdef illumos
 			error = uiomove(va + off, bytes, UIO_READ, uio);
+#else
+			error = vn_io_fault_uiomove(va + off, bytes, uio);
+#endif
 			zfs_unmap_page(sf);
-			VM_OBJECT_LOCK(obj);
+			zfs_vmobject_wlock(obj);
 			page_unhold(pp);
 		} else {
-			VM_OBJECT_UNLOCK(obj);
-			error = dmu_read_uio(os, zp->z_id, uio, bytes);
-			VM_OBJECT_LOCK(obj);
+			zfs_vmobject_wunlock(obj);
+			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, bytes);
+			zfs_vmobject_wlock(obj);
 		}
 		len -= bytes;
 		off = 0;
@@ -615,7 +674,7 @@
 		if (error)
 			break;
 	}
-	VM_OBJECT_UNLOCK(obj);
+	zfs_vmobject_wunlock(obj);
 	return (error);
 }
 
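mappedread() above now falls back to dmu_read_uio_dbuf() on the znode's
cached SA dbuf rather than dmu_read_uio(), which avoids a dnode hold and
release on every call.  A minimal sketch of the fallback path; read_chunk()
is a hypothetical wrapper, not a ZFS function:

static int
read_chunk(znode_t *zp, uio_t *uio, uint64_t nbytes)
{
	/* sa_get_db() returns the already-held bonus dbuf for the znode. */
	return (dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes));
}
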
@@ -644,7 +703,6 @@
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	objset_t	*os;
 	ssize_t		n, nbytes;
 	int		error = 0;
 	rl_t		*rl;
@@ -652,7 +710,6 @@
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
-	os = zfsvfs->z_os;
 
 	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
 		ZFS_EXIT(zfsvfs);
@@ -710,7 +767,7 @@
 	ASSERT(uio->uio_loffset < zp->z_size);
 	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
 
-#ifdef sun
+#ifdef illumos
 	if ((uio->uio_extflg == UIO_XUIO) &&
 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
 		int nblk;
@@ -739,7 +796,7 @@
 			}
 		}
 	}
-#endif	/* sun */
+#endif	/* illumos */
 
 	while (n > 0) {
 		nbytes = MIN(n, zfs_read_chunk_size -
@@ -750,10 +807,12 @@
 			error = mappedread_sf(vp, nbytes, uio);
 		else
 #endif /* __FreeBSD__ */
-		if (vn_has_cached_data(vp))
+		if (vn_has_cached_data(vp)) {
 			error = mappedread(vp, nbytes, uio);
-		else
-			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
+		} else {
+			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, nbytes);
+		}
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
@@ -839,6 +898,16 @@
 	    &zp->z_pflags, 8);
 
 	/*
+	 * In the case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots), our
+	 * callers might not be able to properly detect that we are read-only,
+	 * so check it explicitly here.
+	 */
+	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EROFS));
+	}
+
+	/*
 	 * If immutable or not appending then return EPERM
 	 */
 	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
@@ -869,7 +938,7 @@
 		return (error);
 	}
 
-#ifdef sun
+#ifdef illumos
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
@@ -880,7 +949,7 @@
 		xuio = (xuio_t *)uio;
 	else
 		uio_prefaultpages(MIN(n, max_blksz), uio);
-#endif	/* sun */
+#endif
 
 	/*
 	 * If in append mode, set the io offset pointer to eof.
@@ -938,7 +1007,6 @@
 	while (n > 0) {
 		abuf = NULL;
 		woff = uio->uio_loffset;
-again:
 		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 			if (abuf != NULL)
@@ -990,13 +1058,8 @@
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 		zfs_sa_upgrade_txholds(tx, zp);
-		error = dmu_tx_assign(tx, TXG_NOWAIT);
+		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
-			if (error == ERESTART) {
-				dmu_tx_wait(tx);
-				dmu_tx_abort(tx);
-				goto again;
-			}
 			dmu_tx_abort(tx);
 			if (abuf != NULL)
 				dmu_return_arcbuf(abuf);
@@ -1013,8 +1076,14 @@
 			uint64_t new_blksz;
 
 			if (zp->z_blksz > max_blksz) {
+				/*
+				 * File's blocksize is already larger than the
+				 * "recordsize" property.  Only let it grow to
+				 * the next power of 2.
+				 */
 				ASSERT(!ISP2(zp->z_blksz));
-				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+				new_blksz = MIN(end_size,
+				    1 << highbit64(zp->z_blksz));
 			} else {
 				new_blksz = MIN(end_size, max_blksz);
 			}
@@ -1112,7 +1181,11 @@
 		while ((end_size = zp->z_size) < uio->uio_loffset) {
 			(void) atomic_cas_64(&zp->z_size, end_size,
 			    uio->uio_loffset);
+#ifdef illumos
 			ASSERT(error == 0);
+#else
+			ASSERT(error == 0 || error == EFAULT);
+#endif
 		}
 		/*
 		 * If we are replaying and eof is non zero then force
@@ -1122,7 +1195,10 @@
 		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
 			zp->z_size = zfsvfs->z_replay_eof;
 
-		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		if (error == 0)
+			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		else
+			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 
 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 		dmu_tx_commit(tx);
@@ -1132,10 +1208,10 @@
 		ASSERT(tx_bytes == nbytes);
 		n -= nbytes;
 
-#ifdef sun
+#ifdef illumos
 		if (!xuio && n > 0)
 			uio_prefaultpages(MIN(n, max_blksz), uio);
-#endif	/* sun */
+#endif
 	}
 
 	zfs_range_unlock(rl);
@@ -1149,6 +1225,17 @@
 		return (error);
 	}
 
+#ifdef __FreeBSD__
+	/*
+	 * EFAULT means that at least one page of the source buffer was not
+	 * available.  VFS will re-try remaining I/O upon this error.
+	 */
+	if (error == EFAULT) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+#endif
+
 	if (ioflag & (FSYNC | FDSYNC) ||
 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, zp->z_id);
@@ -1162,7 +1249,6 @@
 {
 	znode_t *zp = zgd->zgd_private;
 	objset_t *os = zp->z_zfsvfs->z_os;
-	int vfslocked;
 
 	if (zgd->zgd_db)
 		dmu_buf_rele(zgd->zgd_db, zgd);
@@ -1169,7 +1255,6 @@
 
 	zfs_range_unlock(zgd->zgd_rl);
 
-	vfslocked = VFS_LOCK_GIANT(zp->z_zfsvfs->z_vfs);
 	/*
 	 * Release the vnode asynchronously as we currently have the
 	 * txg stopped from syncing.
@@ -1180,7 +1265,6 @@
 		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
 
 	kmem_free(zgd, sizeof (zgd_t));
-	VFS_UNLOCK_GIANT(vfslocked);
 }
 
 #ifdef DEBUG
@@ -1246,7 +1330,7 @@
 	} else { /* indirect write */
 		/*
 		 * Have to lock the whole block to ensure when it's
-		 * written out and it's checksum is being calculated
+		 * written out and its checksum is being calculated
 		 * that no one can change the data. We need to re-check
 		 * blocksize after we get the lock in case it's changed!
 		 */
@@ -1334,27 +1418,83 @@
 	return (error);
 }
 
-/*
- * If vnode is for a device return a specfs vnode instead.
- */
 static int
-specvp_check(vnode_t **vpp, cred_t *cr)
+zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
 {
-	int error = 0;
+	int error;
 
-	if (IS_DEVVP(*vpp)) {
-		struct vnode *svp;
+	*vpp = arg;
+	error = vn_lock(*vpp, lkflags);
+	if (error != 0)
+		vrele(*vpp);
+	return (error);
+}
 
-		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
-		VN_RELE(*vpp);
-		if (svp == NULL)
-			error = SET_ERROR(ENOSYS);
-		*vpp = svp;
+static int
+zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
+{
+	znode_t *zdp = VTOZ(dvp);
+	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+	int error;
+	int ltype;
+
+	ASSERT_VOP_LOCKED(dvp, __func__);
+#ifdef DIAGNOSTIC
+	if ((zdp->z_pflags & ZFS_XATTR) == 0)
+		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
+#endif
+
+	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+		ASSERT3P(dvp, ==, vp);
+		vref(dvp);
+		ltype = lkflags & LK_TYPE_MASK;
+		if (ltype != VOP_ISLOCKED(dvp)) {
+			if (ltype == LK_EXCLUSIVE)
+				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+			else /* if (ltype == LK_SHARED) */
+				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+			/*
+			 * Relock for the "." case could leave us with
+			 * reclaimed vnode.
+			 */
+			if (dvp->v_iflag & VI_DOOMED) {
+				vrele(dvp);
+				return (SET_ERROR(ENOENT));
+			}
+		}
+		return (0);
+	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+		/*
+		 * Note that in this case, dvp is the child vnode, and we
+		 * are looking up the parent vnode - exactly reverse from
+		 * normal operation.  Unlocking dvp requires some rather
+		 * tricky unlock/relock dance to prevent mp from being freed;
+		 * use vn_vget_ino_gen() which takes care of all that.
+		 *
+		 * XXX Note that there is a time window when both vnodes are
+		 * unlocked.  It is possible, although highly unlikely, that
+		 * during that window the parent-child relationship between
+		 * the vnodes may change, for example, get reversed.
+		 * In that case we would have a wrong lock order for the vnodes.
+		 * All other filesystems seem to ignore this problem, so we
+		 * do the same here.
+		 * A potential solution could be implemented as follows:
+		 * - using LK_NOWAIT when locking the second vnode and retrying
+		 *   if necessary
+		 * - checking that the parent-child relationship still holds
+		 *   after locking both vnodes and retrying if it doesn't
+		 */
+		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
+		return (error);
+	} else {
+		error = vn_lock(vp, lkflags);
+		if (error != 0)
+			vrele(vp);
+		return (error);
 	}
-	return (error);
 }
 
-
 /*
  * Lookup an entry in a directory, or an extended attribute directory.
  * If it exists, return a held vnode reference for it.
@@ -1366,8 +1506,6 @@
  *		rdir	- root directory vnode [UNUSED].
  *		cr	- credentials of caller.
  *		ct	- caller context
- *		direntflags - directory lookup flags
- *		realpnp - returned pathname.
  *
  *	OUT:	vpp	- vnode of located entry, NULL if not found.
  *
@@ -1382,46 +1520,17 @@
     int nameiop, cred_t *cr, kthread_t *td, int flags)
 {
 	znode_t *zdp = VTOZ(dvp);
+	znode_t *zp;
 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 	int	error = 0;
-	int *direntflags = NULL;
-	void *realpnp = NULL;
 
-	/* fast path */
-	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
-
+	/* fast path (should be redundant with vfs namecache) */
+	if (!(flags & LOOKUP_XATTR)) {
 		if (dvp->v_type != VDIR) {
 			return (SET_ERROR(ENOTDIR));
 		} else if (zdp->z_sa_hdl == NULL) {
 			return (SET_ERROR(EIO));
 		}
-
-		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
-			error = zfs_fastaccesschk_execute(zdp, cr);
-			if (!error) {
-				*vpp = dvp;
-				VN_HOLD(*vpp);
-				return (0);
-			}
-			return (error);
-		} else {
-			vnode_t *tvp = dnlc_lookup(dvp, nm);
-
-			if (tvp) {
-				error = zfs_fastaccesschk_execute(zdp, cr);
-				if (error) {
-					VN_RELE(tvp);
-					return (error);
-				}
-				if (tvp == DNLC_NO_VNODE) {
-					VN_RELE(tvp);
-					return (SET_ERROR(ENOENT));
-				} else {
-					*vpp = tvp;
-					return (specvp_check(vpp, cr));
-				}
-			}
-		}
 	}
 
 	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
@@ -1459,10 +1568,9 @@
 		/*
 		 * Do we have permission to get into attribute directory?
 		 */
-
 		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
 		    B_FALSE, cr)) {
-			VN_RELE(*vpp);
+			vrele(*vpp);
 			*vpp = NULL;
 		}
 
@@ -1470,15 +1578,9 @@
 		return (error);
 	}
 
-	if (dvp->v_type != VDIR) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(ENOTDIR));
-	}
-
 	/*
 	 * Check accessibility of directory.
 	 */
-
 	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
@@ -1490,10 +1592,98 @@
 		return (SET_ERROR(EILSEQ));
 	}
 
-	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
-	if (error == 0)
-		error = specvp_check(vpp, cr);
 
+	/*
+	 * First handle the special cases.
+	 */
+	if ((cnp->cn_flags & ISDOTDOT) != 0) {
+		/*
+		 * If we are a snapshot mounted under .zfs, return
+		 * the vp for the snapshot directory.
+		 */
+		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
+			struct componentname cn;
+			vnode_t *zfsctl_vp;
+			int ltype;
+
+			ZFS_EXIT(zfsvfs);
+			ltype = VOP_ISLOCKED(dvp);
+			VOP_UNLOCK(dvp, 0);
+			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
+			    &zfsctl_vp);
+			if (error == 0) {
+				cn.cn_nameptr = "snapshot";
+				cn.cn_namelen = strlen(cn.cn_nameptr);
+				cn.cn_nameiop = cnp->cn_nameiop;
+				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
+				cn.cn_lkflags = cnp->cn_lkflags;
+				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
+				vput(zfsctl_vp);
+			}
+			vn_lock(dvp, ltype | LK_RETRY);
+			return (error);
+		}
+	}
+	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
+		ZFS_EXIT(zfsvfs);
+		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+			return (SET_ERROR(ENOTSUP));
+		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
+		return (error);
+	}
+
+	/*
+	 * The loop retries the lookup if the parent-child relationship
+	 * changes during the dot-dot locking complexities.
+	 */
+	for (;;) {
+		uint64_t parent;
+
+		error = zfs_dirlook(zdp, nm, &zp);
+		if (error == 0)
+			*vpp = ZTOV(zp);
+
+		ZFS_EXIT(zfsvfs);
+		if (error != 0)
+			break;
+
+		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
+		if (error != 0) {
+			/*
+			 * If we've got a locking error, then the vnode
+			 * got reclaimed because of a force unmount.
+			 * We never enter doomed vnodes into the name cache.
+			 */
+			*vpp = NULL;
+			return (error);
+		}
+
+		if ((cnp->cn_flags & ISDOTDOT) == 0)
+			break;
+
+		ZFS_ENTER(zfsvfs);
+		if (zdp->z_sa_hdl == NULL) {
+			error = SET_ERROR(EIO);
+		} else {
+			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+			    &parent, sizeof (parent));
+		}
+		if (error != 0) {
+			ZFS_EXIT(zfsvfs);
+			vput(ZTOV(zp));
+			break;
+		}
+		if (zp->z_id == parent) {
+			ZFS_EXIT(zfsvfs);
+			break;
+		}
+		vput(ZTOV(zp));
+	}
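
Reduced to its skeleton, the loop above is a lock-then-revalidate pattern: resolve the entry, take the vnode locks (which for ".." means temporarily dropping the directory lock), then re-read the parent pointer and start over if the relationship changed in the unlocked window. A sketch with hypothetical helpers lookup_parent(), lock_pair(), parent_id() and vnode_id() standing in for zfs_dirlook(), zfs_lookup_lock() and the sa_lookup() of SA_ZPL_PARENT:

    static int
    lookup_dotdot(vnode_t *child, vnode_t **parentp)
    {
        vnode_t *parent;
        int error;

        for (;;) {
            error = lookup_parent(child, &parent);  /* returns held vnode */
            if (error != 0)
                return (error);
            error = lock_pair(child, parent);       /* may drop both locks */
            if (error != 0)
                return (error);
            if (parent_id(child) == vnode_id(parent)) {
                *parentp = parent;                  /* ".." still holds */
                return (0);
            }
            vput(parent);       /* raced with a rename, retry the lookup */
        }
    }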
+
+out:
+	if (error != 0)
+		*vpp = NULL;
+
 	/* Translate errors and add SAVENAME when needed. */
 	if (cnp->cn_flags & ISLASTCN) {
 		switch (nameiop) {
@@ -1511,42 +1701,20 @@
 			break;
 		}
 	}
-	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
-		int ltype = 0;
 
-		if (cnp->cn_flags & ISDOTDOT) {
-			ltype = VOP_ISLOCKED(dvp);
-			VOP_UNLOCK(dvp, 0);
-		}
-		ZFS_EXIT(zfsvfs);
-		error = zfs_vnode_lock(*vpp, cnp->cn_lkflags);
-		if (cnp->cn_flags & ISDOTDOT)
-			vn_lock(dvp, ltype | LK_RETRY);
-		if (error != 0) {
-			VN_RELE(*vpp);
-			*vpp = NULL;
-			return (error);
-		}
-	} else {
-		ZFS_EXIT(zfsvfs);
-	}
+	/* Insert name into cache (as non-existent) if appropriate. */
+	if (zfsvfs->z_use_namecache &&
+	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
+		cache_enter(dvp, NULL, cnp);
 
-#ifdef FREEBSD_NAMECACHE
-	/*
-	 * Insert name into cache (as non-existent) if appropriate.
-	 */
-	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
-		cache_enter(dvp, *vpp, cnp);
-	/*
-	 * Insert name into cache if appropriate.
-	 */
-	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
+	/* Insert name into cache if appropriate. */
+	if (zfsvfs->z_use_namecache &&
+	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
 		if (!(cnp->cn_flags & ISLASTCN) ||
 		    (nameiop != DELETE && nameiop != RENAME)) {
 			cache_enter(dvp, *vpp, cnp);
 		}
 	}
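
The NULL vnode in the first cache_enter() call above is what makes the entry negative: the VFS name cache then remembers that the name is absent under dvp, so repeated lookups of a missing name are answered without re-entering ZFS. Condensed (a sketch; the z_use_namecache gate and the DELETE/RENAME exclusion are elided):

    if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
        cache_enter(dvp, NULL, cnp);    /* negative entry: name absent */
    else if (error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
        cache_enter(dvp, *vpp, cnp);    /* positive entry: name -> vnode */
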
-#endif
 
 	return (error);
 }
@@ -1564,7 +1732,7 @@
  *		cr	- credentials of caller.
  *		flag	- large file flag [UNUSED].
  *		ct	- caller context
- *		vsecp 	- ACL to be set
+ *		vsecp	- ACL to be set
  *
  *	OUT:	vpp	- vnode of created or trunc'd entry.
  *
@@ -1584,7 +1752,6 @@
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	objset_t	*os;
-	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	ksid_t		*ksid;
@@ -1592,9 +1759,9 @@
 	gid_t		gid = crgetgid(cr);
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
-	boolean_t	have_acl = B_FALSE;
 	void		*vsecp = NULL;
 	int		flag = 0;
+	uint64_t	txtype;
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
@@ -1630,176 +1797,90 @@
 			return (error);
 		}
 	}
-top:
+
 	*vpp = NULL;
 
 	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
 		vap->va_mode &= ~S_ISVTX;
 
-	if (*name == '\0') {
-		/*
-		 * Null component name refers to the directory itself.
-		 */
-		VN_HOLD(dvp);
-		zp = dzp;
-		dl = NULL;
-		error = 0;
-	} else {
-		/* possible VN_HOLD(zp) */
-		int zflg = 0;
+	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+	if (error) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	ASSERT3P(zp, ==, NULL);
 
-		if (flag & FIGNORECASE)
-			zflg |= ZCILOOK;
-
-		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
-		    NULL, NULL);
-		if (error) {
-			if (have_acl)
-				zfs_acl_ids_free(&acl_ids);
-			if (strcmp(name, "..") == 0)
-				error = SET_ERROR(EISDIR);
-			ZFS_EXIT(zfsvfs);
-			return (error);
-		}
+	/*
+	 * Create a new file object and update the directory
+	 * to reference it.
+	 */
+	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+		goto out;
 	}
 
-	if (zp == NULL) {
-		uint64_t txtype;
+	/*
+	 * We only support the creation of regular files in
+	 * extended attribute directories.
+	 */
 
-		/*
-		 * Create a new file object and update the directory
-		 * to reference it.
-		 */
-		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
-			if (have_acl)
-				zfs_acl_ids_free(&acl_ids);
-			goto out;
-		}
+	if ((dzp->z_pflags & ZFS_XATTR) &&
+	    (vap->va_type != VREG)) {
+		error = SET_ERROR(EINVAL);
+		goto out;
+	}
 
-		/*
-		 * We only support the creation of regular files in
-		 * extended attribute directories.
-		 */
+	if ((error = zfs_acl_ids_create(dzp, 0, vap,
+	    cr, vsecp, &acl_ids)) != 0)
+		goto out;
 
-		if ((dzp->z_pflags & ZFS_XATTR) &&
-		    (vap->va_type != VREG)) {
-			if (have_acl)
-				zfs_acl_ids_free(&acl_ids);
-			error = SET_ERROR(EINVAL);
-			goto out;
-		}
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+		zfs_acl_ids_free(&acl_ids);
+		error = SET_ERROR(EDQUOT);
+		goto out;
+	}
 
-		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
-		    cr, vsecp, &acl_ids)) != 0)
-			goto out;
-		have_acl = B_TRUE;
+	getnewvnode_reserve(1);
 
-		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
-			zfs_acl_ids_free(&acl_ids);
-			error = SET_ERROR(EDQUOT);
-			goto out;
-		}
+	tx = dmu_tx_create(os);
 
-		tx = dmu_tx_create(os);
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE);
 
-		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
-		    ZFS_SA_BASE_ATTR_SIZE);
-
-		fuid_dirtied = zfsvfs->z_fuid_dirty;
-		if (fuid_dirtied)
-			zfs_fuid_txhold(zfsvfs, tx);
-		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
-		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
-		if (!zfsvfs->z_use_sa &&
-		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
-			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
-			    0, acl_ids.z_aclp->z_acl_bytes);
-		}
-		error = dmu_tx_assign(tx, TXG_NOWAIT);
-		if (error) {
-			zfs_dirent_unlock(dl);
-			if (error == ERESTART) {
-				dmu_tx_wait(tx);
-				dmu_tx_abort(tx);
-				goto top;
-			}
-			zfs_acl_ids_free(&acl_ids);
-			dmu_tx_abort(tx);
-			ZFS_EXIT(zfsvfs);
-			return (error);
-		}
-		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
-
-		if (fuid_dirtied)
-			zfs_fuid_sync(zfsvfs, tx);
-
-		(void) zfs_link_create(dl, zp, tx, ZNEW);
-		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
-		if (flag & FIGNORECASE)
-			txtype |= TX_CI;
-		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
-		    vsecp, acl_ids.z_fuidp, vap);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+	if (!zfsvfs->z_use_sa &&
+	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+		    0, acl_ids.z_aclp->z_acl_bytes);
+	}
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
 		zfs_acl_ids_free(&acl_ids);
-		dmu_tx_commit(tx);
-	} else {
-		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
+		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
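
This hunk is the first of many in this file making the same conversion: dmu_tx_assign(TXG_NOWAIT) with an explicit ERESTART retry loop becomes a single dmu_tx_assign(TXG_WAIT) call that sleeps internally until the transaction can be assigned, leaving only hard failures for the caller. Schematically (a sketch, with dmu_tx_hold_bonus() as a stand-in for whatever holds the real caller needs):

    /* Old idiom: non-blocking assignment plus a rebuild-and-retry loop. */
    static int
    assign_nowait(objset_t *os, uint64_t obj)
    {
        dmu_tx_t *tx;
        int error;
    top:
        tx = dmu_tx_create(os);
        dmu_tx_hold_bonus(tx, obj);
        error = dmu_tx_assign(tx, TXG_NOWAIT);
        if (error != 0) {
            if (error == ERESTART) {
                dmu_tx_wait(tx);        /* wait for txg space */
                dmu_tx_abort(tx);
                goto top;               /* rebuild the tx and retry */
            }
            dmu_tx_abort(tx);
            return (error);
        }
        dmu_tx_commit(tx);
        return (0);
    }

    /* New idiom: dmu_tx_assign() waits internally, no retry loop. */
    static int
    assign_wait(objset_t *os, uint64_t obj)
    {
        dmu_tx_t *tx;
        int error;

        tx = dmu_tx_create(os);
        dmu_tx_hold_bonus(tx, obj);
        error = dmu_tx_assign(tx, TXG_WAIT);
        if (error != 0) {
            dmu_tx_abort(tx);           /* only hard failures remain */
            return (error);
        }
        dmu_tx_commit(tx);
        return (0);
    }
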
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
-		if (have_acl)
-			zfs_acl_ids_free(&acl_ids);
-		have_acl = B_FALSE;
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
 
-		/*
-		 * A directory entry already exists for this name.
-		 */
-		/*
-		 * Can't truncate an existing file if in exclusive mode.
-		 */
-		if (excl == EXCL) {
-			error = SET_ERROR(EEXIST);
-			goto out;
-		}
-		/*
-		 * Can't open a directory for writing.
-		 */
-		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
-			error = SET_ERROR(EISDIR);
-			goto out;
-		}
-		/*
-		 * Verify requested access to file.
-		 */
-		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
-			goto out;
-		}
+	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+	    vsecp, acl_ids.z_fuidp, vap);
+	zfs_acl_ids_free(&acl_ids);
+	dmu_tx_commit(tx);
 
-		mutex_enter(&dzp->z_lock);
-		dzp->z_seq++;
-		mutex_exit(&dzp->z_lock);
+	getnewvnode_drop_reserve();
 
-		/*
-		 * Truncate regular files if requested.
-		 */
-		if ((ZTOV(zp)->v_type == VREG) &&
-		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
-			/* we can't hold any locks when calling zfs_freesp() */
-			zfs_dirent_unlock(dl);
-			dl = NULL;
-			error = zfs_freesp(zp, 0, 0, mode, TRUE);
-			if (error == 0) {
-				vnevent_create(ZTOV(zp), ct);
-			}
-		}
-	}
 out:
-	if (dl)
-		zfs_dirent_unlock(dl);
-
-	if (error) {
-		if (zp)
-			VN_RELE(ZTOV(zp));
-	} else {
+	if (error == 0) {
 		*vpp = ZTOV(zp);
-		error = specvp_check(vpp, cr);
 	}
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
@@ -1825,57 +1906,31 @@
  *	 vp - ctime (if nlink > 0)
  */
 
-uint64_t null_xattr = 0;
-
 /*ARGSUSED*/
 static int
-zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
-    int flags)
+zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
 {
-	znode_t		*zp, *dzp = VTOZ(dvp);
+	znode_t		*dzp = VTOZ(dvp);
+	znode_t		*zp = VTOZ(vp);
 	znode_t		*xzp;
-	vnode_t		*vp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	acl_obj, xattr_obj;
-	uint64_t 	xattr_obj_unlinked = 0;
 	uint64_t	obj = 0;
-	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
-	boolean_t	may_delete_now, delete_now = FALSE;
 	boolean_t	unlinked, toobig = FALSE;
 	uint64_t	txtype;
-	pathname_t	*realnmp = NULL;
-	pathname_t	realnm;
 	int		error;
-	int		zflg = ZEXISTS;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
+	ZFS_VERIFY_ZP(zp);
 	zilog = zfsvfs->z_log;
+	zp = VTOZ(vp);
 
-	if (flags & FIGNORECASE) {
-		zflg |= ZCILOOK;
-		pn_alloc(&realnm);
-		realnmp = &realnm;
-	}
-
-top:
 	xattr_obj = 0;
 	xzp = NULL;
-	/*
-	 * Attempt to lock directory; fail if entry doesn't exist.
-	 */
-	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
-	    NULL, realnmp)) {
-		if (realnmp)
-			pn_free(realnmp);
-		ZFS_EXIT(zfsvfs);
-		return (error);
-	}
 
-	vp = ZTOV(zp);
-
 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
 		goto out;
 	}
@@ -1890,14 +1945,15 @@
 
 	vnevent_remove(vp, dvp, name, ct);
 
-	if (realnmp)
-		dnlc_remove(dvp, realnmp->pn_buf);
-	else
-		dnlc_remove(dvp, name);
+	obj = zp->z_id;
 
-	VI_LOCK(vp);
-	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
-	VI_UNLOCK(vp);
+	/* are there any extended attributes? */
+	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+	    &xattr_obj, sizeof (xattr_obj));
+	if (error == 0 && xattr_obj) {
+		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+		ASSERT0(error);
+	}
 
 	/*
 	 * We may delete the znode now, or we may put it in the unlinked set;
@@ -1905,51 +1961,27 @@
 	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
 	 * allow for either case.
 	 */
-	obj = zp->z_id;
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
-	if (may_delete_now) {
-		toobig =
-		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
-		/* if the file is too big, only hold_free a token amount */
-		dmu_tx_hold_free(tx, zp->z_id, 0,
-		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
-	}
 
-	/* are there any extended attributes? */
-	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
-	    &xattr_obj, sizeof (xattr_obj));
-	if (error == 0 && xattr_obj) {
-		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
-		ASSERT0(error);
+	if (xzp) {
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
 	}
 
-	mutex_enter(&zp->z_lock);
-	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
-		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
-	mutex_exit(&zp->z_lock);
-
 	/* charge as an update -- would be nice not to charge at all */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	/*
+	 * Mark this transaction as typically resulting in a net free of space.
+	 */
+	dmu_tx_mark_netfree(tx);
+
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		zfs_dirent_unlock(dl);
-		VN_RELE(vp);
-		if (xzp)
-			VN_RELE(ZTOV(xzp));
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
-		if (realnmp)
-			pn_free(realnmp);
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
@@ -1958,7 +1990,7 @@
 	/*
 	 * Remove the directory entry.
 	 */
-	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
+	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
 
 	if (error) {
 		dmu_tx_commit(tx);
@@ -1966,77 +1998,18 @@
 	}
 
 	if (unlinked) {
-
-		/*
-		 * Hold z_lock so that we can make sure that the ACL obj
-		 * hasn't changed.  Could have been deleted due to
-		 * zfs_sa_upgrade().
-		 */
-		mutex_enter(&zp->z_lock);
-		VI_LOCK(vp);
-		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
-		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
-		delete_now = may_delete_now && !toobig &&
-		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
-		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
-		    acl_obj;
-		VI_UNLOCK(vp);
-	}
-
-	if (delete_now) {
-#ifdef __FreeBSD__
-		panic("zfs_remove: delete_now branch taken");
-#endif
-		if (xattr_obj_unlinked) {
-			ASSERT3U(xzp->z_links, ==, 2);
-			mutex_enter(&xzp->z_lock);
-			xzp->z_unlinked = 1;
-			xzp->z_links = 0;
-			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
-			    &xzp->z_links, sizeof (xzp->z_links), tx);
-			ASSERT3U(error,  ==,  0);
-			mutex_exit(&xzp->z_lock);
-			zfs_unlinked_add(xzp, tx);
-
-			if (zp->z_is_sa)
-				error = sa_remove(zp->z_sa_hdl,
-				    SA_ZPL_XATTR(zfsvfs), tx);
-			else
-				error = sa_update(zp->z_sa_hdl,
-				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
-				    sizeof (uint64_t), tx);
-			ASSERT0(error);
-		}
-		VI_LOCK(vp);
-		vp->v_count--;
-		ASSERT0(vp->v_count);
-		VI_UNLOCK(vp);
-		mutex_exit(&zp->z_lock);
-		zfs_znode_delete(zp, tx);
-	} else if (unlinked) {
-		mutex_exit(&zp->z_lock);
 		zfs_unlinked_add(zp, tx);
-#ifdef __FreeBSD__
 		vp->v_vflag |= VV_NOSYNC;
-#endif
 	}
 
 	txtype = TX_REMOVE;
-	if (flags & FIGNORECASE)
-		txtype |= TX_CI;
 	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
 
 	dmu_tx_commit(tx);
 out:
-	if (realnmp)
-		pn_free(realnmp);
 
-	zfs_dirent_unlock(dl);
-
-	if (!delete_now)
-		VN_RELE(vp);
 	if (xzp)
-		VN_RELE(ZTOV(xzp));
+		vrele(ZTOV(xzp));
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
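
The new dmu_tx_mark_netfree() call in this path is worth a note: it flags the transaction as one expected to free more space than it consumes, so assignment is not refused with ENOSPC on a nearly full pool; without it, a full pool could reject the very removes needed to make space. Condensed from the code above, the delete-path transaction setup is roughly:

    tx = dmu_tx_create(zfsvfs->z_os);
    dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);    /* drop the dir entry */
    dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);      /* update link count */
    dmu_tx_mark_netfree(tx);                        /* exempt from ENOSPC */
    error = dmu_tx_assign(tx, TXG_WAIT);
    if (error != 0)
        dmu_tx_abort(tx);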
@@ -2067,17 +2040,14 @@
  */
 /*ARGSUSED*/
 static int
-zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
-    caller_context_t *ct, int flags, vsecattr_t *vsecp)
+zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
-	zfs_dirlock_t	*dl;
 	uint64_t	txtype;
 	dmu_tx_t	*tx;
 	int		error;
-	int		zf = ZNEW;
 	ksid_t		*ksid;
 	uid_t		uid;
 	gid_t		gid = crgetgid(cr);
@@ -2097,7 +2067,7 @@
 	else
 		uid = crgetuid(cr);
 	if (zfsvfs->z_use_fuids == B_FALSE &&
-	    (vsecp || (vap->va_mask & AT_XVATTR) ||
+	    ((vap->va_mask & AT_XVATTR) ||
 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
@@ -2115,8 +2085,6 @@
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
-	if (flags & FIGNORECASE)
-		zf |= ZCILOOK;
 
 	if (vap->va_mask & AT_XVATTR) {
 		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
@@ -2127,10 +2095,11 @@
 	}
 
 	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
-	    vsecp, &acl_ids)) != 0) {
+	    NULL, &acl_ids)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
+
 	/*
 	 * First make sure the new directory doesn't exist.
 	 *
@@ -2138,19 +2107,17 @@
 	 * EACCES instead of EEXIST which can cause some applications
 	 * to fail.
 	 */
-top:
 	*vpp = NULL;
 
-	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
-	    NULL, NULL)) {
+	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
+	ASSERT3P(zp, ==, NULL);
 
 	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
 		zfs_acl_ids_free(&acl_ids);
-		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
@@ -2157,7 +2124,6 @@
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
-		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EDQUOT));
 	}
@@ -2165,6 +2131,7 @@
 	/*
 	 * Add a new entry to the directory.
 	 */
+	getnewvnode_reserve(1);
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
@@ -2179,16 +2146,11 @@
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		zfs_dirent_unlock(dl);
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
@@ -2204,14 +2166,12 @@
 	/*
 	 * Now put new name in parent dir.
 	 */
-	(void) zfs_link_create(dl, zp, tx, ZNEW);
+	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
 
 	*vpp = ZTOV(zp);
 
-	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
-	if (flags & FIGNORECASE)
-		txtype |= TX_CI;
-	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
+	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
+	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
 	    acl_ids.z_fuidp, vap);
 
 	zfs_acl_ids_free(&acl_ids);
@@ -2218,7 +2178,7 @@
 
 	dmu_tx_commit(tx);
 
-	zfs_dirent_unlock(dl);
+	getnewvnode_drop_reserve();
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
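
The getnewvnode_reserve(1) call added to each create path pre-allocates a vnode before the DMU transaction opens; the likely motivation (an assumption here, the diff does not state it) is that allocating a vnode later, with the transaction already assigned, could sleep in the vnode allocator and recurse into the filesystem at an unsafe point. Each reserve has to be paired with getnewvnode_drop_reserve() on every exit path, as in this outline of the shape used above (error handling abbreviated):

    getnewvnode_reserve(1);                 /* pre-allocate a vnode */
    tx = dmu_tx_create(os);
    /* ... dmu_tx_hold_*() calls ... */
    if ((error = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
        dmu_tx_abort(tx);
        getnewvnode_drop_reserve();         /* pair on the error path */
        return (error);
    }
    zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);  /* consumes the reserve */
    dmu_tx_commit(tx);
    getnewvnode_drop_reserve();             /* pair on the success path */
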
@@ -2246,39 +2206,21 @@
  */
 /*ARGSUSED*/
 static int
-zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
-    caller_context_t *ct, int flags)
+zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
 {
 	znode_t		*dzp = VTOZ(dvp);
-	znode_t		*zp;
-	vnode_t		*vp;
+	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
-	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
-	int		zflg = ZEXISTS;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
+	ZFS_VERIFY_ZP(zp);
 	zilog = zfsvfs->z_log;
 
-	if (flags & FIGNORECASE)
-		zflg |= ZCILOOK;
-top:
-	zp = NULL;
 
-	/*
-	 * Attempt to lock directory; fail if entry doesn't exist.
-	 */
-	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
-	    NULL, NULL)) {
-		ZFS_EXIT(zfsvfs);
-		return (error);
-	}
-
-	vp = ZTOV(zp);
-
 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
 		goto out;
 	}
@@ -2288,25 +2230,8 @@
 		goto out;
 	}
 
-	if (vp == cwd) {
-		error = SET_ERROR(EINVAL);
-		goto out;
-	}
-
 	vnevent_rmdir(vp, dvp, name, ct);
 
-	/*
-	 * Grab a lock on the directory to make sure that noone is
-	 * trying to add (or lookup) entries while we are removing it.
-	 */
-	rw_enter(&zp->z_name_lock, RW_WRITER);
-
-	/*
-	 * Grab a lock on the parent pointer to make sure we play well
-	 * with the treewalk and directory rename code.
-	 */
-	rw_enter(&zp->z_parent_lock, RW_WRITER);
-
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
@@ -2313,47 +2238,27 @@
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	dmu_tx_mark_netfree(tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		rw_exit(&zp->z_parent_lock);
-		rw_exit(&zp->z_name_lock);
-		zfs_dirent_unlock(dl);
-		VN_RELE(vp);
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
-#ifdef FREEBSD_NAMECACHE
 	cache_purge(dvp);
-#endif
 
-	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
+	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
 
 	if (error == 0) {
 		uint64_t txtype = TX_RMDIR;
-		if (flags & FIGNORECASE)
-			txtype |= TX_CI;
 		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
 	}
 
 	dmu_tx_commit(tx);
 
-	rw_exit(&zp->z_parent_lock);
-	rw_exit(&zp->z_name_lock);
-#ifdef FREEBSD_NAMECACHE
 	cache_purge(vp);
-#endif
 out:
-	zfs_dirent_unlock(dl);
-
-	VN_RELE(vp);
-
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
@@ -2578,10 +2483,10 @@
 			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
 				goto skip_entry;
 			if (!zfs_has_access(ezp, cr)) {
-				VN_RELE(ZTOV(ezp));
+				vrele(ZTOV(ezp));
 				goto skip_entry;
 			}
-			VN_RELE(ZTOV(ezp));
+			vrele(ZTOV(ezp));
 		}
 
 		if (flags & V_RDDIR_ENTFLAGS)
@@ -2632,7 +2537,8 @@
 
 		/* Prefetch znode */
 		if (prefetch)
-			dmu_prefetch(os, objnum, 0, 0);
+			dmu_prefetch(os, objnum, 0, 0, 0,
+			    ZIO_PRIORITY_SYNC_READ);
 
 	skip_entry:
 		/*
@@ -2777,10 +2683,9 @@
 	 * than to determine whether we were asked the question.
 	 */
 
-	mutex_enter(&zp->z_lock);
 	vap->va_type = IFTOVT(zp->z_mode);
 	vap->va_mode = zp->z_mode & ~S_IFMT;
-#ifdef sun
+#ifdef illumos
 	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
 #else
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
@@ -2792,7 +2697,7 @@
 		links = zp->z_links;
 	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
 	vap->va_size = zp->z_size;
-#ifdef sun
+#ifdef illumos
 	vap->va_rdev = vp->v_rdev;
 #else
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
@@ -2800,6 +2705,7 @@
 #endif
 	vap->va_seq = zp->z_seq;
 	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
+	vap->va_filerev = zp->z_seq;
 
 	/*
 	 * Add in any requested optional attributes and the create time.
@@ -2913,7 +2819,6 @@
 	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
 	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
 
-	mutex_exit(&zp->z_lock);
 
 	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
 	vap->va_blksize = blksize;
@@ -3049,7 +2954,6 @@
 		}
 	}
 
-top:
 	attrzp = NULL;
 	aclp = NULL;
 
@@ -3138,7 +3042,6 @@
 		}
 	}
 
-	mutex_enter(&zp->z_lock);
 	oldva.va_mode = zp->z_mode;
 	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
 	if (mask & AT_XVATTR) {
@@ -3212,7 +3115,6 @@
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
-			mutex_exit(&zp->z_lock);
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EPERM));
 		}
@@ -3224,8 +3126,6 @@
 		}
 	}
 
-	mutex_exit(&zp->z_lock);
-
 	if (mask & AT_MODE) {
 		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
 			err = secpolicy_setid_setsticky_clear(vp, vap,
@@ -3291,6 +3191,11 @@
 
 		if (err == 0 && xattr_obj) {
 			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
+			if (err == 0) {
+				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
+				if (err != 0)
+					vrele(ZTOV(attrzp));
+			}
 			if (err)
 				goto out2;
 		}
@@ -3300,7 +3205,7 @@
 			if (new_uid != zp->z_uid &&
 			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
 				if (attrzp)
-					VN_RELE(ZTOV(attrzp));
+					vput(ZTOV(attrzp));
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
@@ -3312,7 +3217,7 @@
 			if (new_gid != zp->z_gid &&
 			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
 				if (attrzp)
-					VN_RELE(ZTOV(attrzp));
+					vput(ZTOV(attrzp));
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
@@ -3334,7 +3239,6 @@
 		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
 			goto out;
 
-		mutex_enter(&zp->z_lock);
 		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
 			/*
 			 * Are we upgrading ACL from old V0 format
@@ -3355,7 +3259,6 @@
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, aclp->z_acl_bytes);
 		}
-		mutex_exit(&zp->z_lock);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 	} else {
 		if ((mask & AT_XVATTR) &&
@@ -3375,12 +3278,9 @@
 
 	zfs_sa_upgrade_txholds(tx, zp);
 
-	err = dmu_tx_assign(tx, TXG_NOWAIT);
-	if (err) {
-		if (err == ERESTART)
-			dmu_tx_wait(tx);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err)
 		goto out;
-	}
 
 	count = 0;
 	/*
@@ -3391,10 +3291,8 @@
 	 * updated as a side-effect of calling this function.
 	 */
 
-
 	if (mask & (AT_UID|AT_GID|AT_MODE))
 		mutex_enter(&zp->z_acl_lock);
-	mutex_enter(&zp->z_lock);
 
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, sizeof (zp->z_pflags));
@@ -3402,7 +3300,6 @@
 	if (attrzp) {
 		if (mask & (AT_UID|AT_GID|AT_MODE))
 			mutex_enter(&attrzp->z_acl_lock);
-		mutex_enter(&attrzp->z_lock);
 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
 		    sizeof (attrzp->z_pflags));
@@ -3536,7 +3433,6 @@
 	if (mask != 0)
 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
 
-	mutex_exit(&zp->z_lock);
 	if (mask & (AT_UID|AT_GID|AT_MODE))
 		mutex_exit(&zp->z_acl_lock);
 
@@ -3543,7 +3439,6 @@
 	if (attrzp) {
 		if (mask & (AT_UID|AT_GID|AT_MODE))
 			mutex_exit(&attrzp->z_acl_lock);
-		mutex_exit(&attrzp->z_lock);
 	}
 out:
 	if (err == 0 && attrzp) {
@@ -3553,7 +3448,7 @@
 	}
 
 	if (attrzp)
-		VN_RELE(ZTOV(attrzp));
+		vput(ZTOV(attrzp));
 
 	if (aclp)
 		zfs_acl_free(aclp);
@@ -3565,8 +3460,6 @@
 
 	if (err) {
 		dmu_tx_abort(tx);
-		if (err == ERESTART)
-			goto top;
 	} else {
 		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		dmu_tx_commit(tx);
@@ -3580,101 +3473,236 @@
 	return (err);
 }
 
-typedef struct zfs_zlock {
-	krwlock_t	*zl_rwlock;	/* lock we acquired */
-	znode_t		*zl_znode;	/* znode we held */
-	struct zfs_zlock *zl_next;	/* next in list */
-} zfs_zlock_t;
-
 /*
- * Drop locks and release vnodes that were held by zfs_rename_lock().
+ * We acquire all but the sdvp lock using non-blocking acquisitions.  If we
+ * fail to acquire any lock in the path we will drop all held locks,
+ * acquire the new lock in a blocking fashion, and then release it and
+ * restart the rename.  This acquire/release step ensures that we do not
+ * spin on a lock waiting for release.  On error, release all vnode locks
+ * and decrement references the way tmpfs_rename() would do.
  */
-static void
-zfs_rename_unlock(zfs_zlock_t **zlpp)
+static int
+zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
+    struct vnode *tdvp, struct vnode **tvpp,
+    const struct componentname *scnp, const struct componentname *tcnp)
 {
-	zfs_zlock_t *zl;
+	zfsvfs_t	*zfsvfs;
+	struct vnode	*nvp, *svp, *tvp;
+	znode_t		*sdzp, *tdzp, *szp, *tzp;
+	const char	*snm = scnp->cn_nameptr;
+	const char	*tnm = tcnp->cn_nameptr;
+	int error;
 
-	while ((zl = *zlpp) != NULL) {
-		if (zl->zl_znode != NULL)
-			VN_RELE(ZTOV(zl->zl_znode));
-		rw_exit(zl->zl_rwlock);
-		*zlpp = zl->zl_next;
-		kmem_free(zl, sizeof (*zl));
+	VOP_UNLOCK(tdvp, 0);
+	if (*tvpp != NULL && *tvpp != tdvp)
+		VOP_UNLOCK(*tvpp, 0);
+
+relock:
+	error = vn_lock(sdvp, LK_EXCLUSIVE);
+	if (error)
+		goto out;
+	sdzp = VTOZ(sdvp);
+
+	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
+	if (error != 0) {
+		VOP_UNLOCK(sdvp, 0);
+		if (error != EBUSY)
+			goto out;
+		error = vn_lock(tdvp, LK_EXCLUSIVE);
+		if (error)
+			goto out;
+		VOP_UNLOCK(tdvp, 0);
+		goto relock;
 	}
-}
+	tdzp = VTOZ(tdvp);
 
-/*
- * Search back through the directory tree, using the ".." entries.
- * Lock each directory in the chain to prevent concurrent renames.
- * Fail any attempt to move a directory into one of its own descendants.
- * XXX - z_parent_lock can overlap with map or grow locks
- */
-static int
-zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
-{
-	zfs_zlock_t	*zl;
-	znode_t		*zp = tdzp;
-	uint64_t	rootid = zp->z_zfsvfs->z_root;
-	uint64_t	oidp = zp->z_id;
-	krwlock_t	*rwlp = &szp->z_parent_lock;
-	krw_t		rw = RW_WRITER;
+	/*
+	 * Before using sdzp and tdzp we must ensure that they are live.
+	 * As a porting legacy from illumos we have two things to worry
+	 * about.  One is typical for FreeBSD: that the vnode is not
+	 * reclaimed (doomed).  The other is that the znode is live.
+	 * The current code can invalidate the znode without acquiring the
+	 * corresponding vnode lock if the object represented by the znode
+	 * and vnode is no longer valid after a rollback or receive operation.
+	 * z_teardown_lock, hidden behind ZFS_ENTER and ZFS_EXIT, is the lock
+	 * that protects the znodes from such invalidation.
+	 */
+	zfsvfs = sdzp->z_zfsvfs;
+	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
+	ZFS_ENTER(zfsvfs);
 
 	/*
-	 * First pass write-locks szp and compares to zp->z_id.
-	 * Later passes read-lock zp and compare to zp->z_parent.
+	 * We cannot use ZFS_VERIFY_ZP() here because it could return
+	 * directly, bypassing the cleanup code in the case of an error.
 	 */
-	do {
-		if (!rw_tryenter(rwlp, rw)) {
-			/*
-			 * Another thread is renaming in this path.
-			 * Note that if we are a WRITER, we don't have any
-			 * parent_locks held yet.
-			 */
-			if (rw == RW_READER && zp->z_id > szp->z_id) {
-				/*
-				 * Drop our locks and restart
-				 */
-				zfs_rename_unlock(&zl);
-				*zlpp = NULL;
-				zp = tdzp;
-				oidp = zp->z_id;
-				rwlp = &szp->z_parent_lock;
-				rw = RW_WRITER;
-				continue;
-			} else {
-				/*
-				 * Wait for other thread to drop its locks
-				 */
-				rw_enter(rwlp, rw);
+	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+		ZFS_EXIT(zfsvfs);
+		VOP_UNLOCK(sdvp, 0);
+		VOP_UNLOCK(tdvp, 0);
+		error = SET_ERROR(EIO);
+		goto out;
+	}
+
+	/*
+	 * Re-resolve svp to be certain it still exists and fetch the
+	 * correct vnode.
+	 */
+	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
+	if (error != 0) {
+		/* Source entry invalid or not there. */
+		ZFS_EXIT(zfsvfs);
+		VOP_UNLOCK(sdvp, 0);
+		VOP_UNLOCK(tdvp, 0);
+		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
+		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
+			error = SET_ERROR(EINVAL);
+		goto out;
+	}
+	svp = ZTOV(szp);
+
+	/*
+	 * Re-resolve tvp, if it disappeared we just carry on.
+	 */
+	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
+	if (error != 0) {
+		ZFS_EXIT(zfsvfs);
+		VOP_UNLOCK(sdvp, 0);
+		VOP_UNLOCK(tdvp, 0);
+		vrele(svp);
+		if ((tcnp->cn_flags & ISDOTDOT) != 0)
+			error = SET_ERROR(EINVAL);
+		goto out;
+	}
+	if (tzp != NULL)
+		tvp = ZTOV(tzp);
+	else
+		tvp = NULL;
+
+	/*
+	 * At present the vnode locks must be acquired before z_teardown_lock,
+	 * although it would be more logical to use the opposite order.
+	 */
+	ZFS_EXIT(zfsvfs);
+
+	/*
+	 * Now try acquire locks on svp and tvp.
+	 */
+	nvp = svp;
+	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+	if (error != 0) {
+		VOP_UNLOCK(sdvp, 0);
+		VOP_UNLOCK(tdvp, 0);
+		if (tvp != NULL)
+			vrele(tvp);
+		if (error != EBUSY) {
+			vrele(nvp);
+			goto out;
+		}
+		error = vn_lock(nvp, LK_EXCLUSIVE);
+		if (error != 0) {
+			vrele(nvp);
+			goto out;
+		}
+		VOP_UNLOCK(nvp, 0);
+		/*
+		 * Concurrent rename race.
+		 * XXX ?
+		 */
+		if (nvp == tdvp) {
+			vrele(nvp);
+			error = SET_ERROR(EINVAL);
+			goto out;
+		}
+		vrele(*svpp);
+		*svpp = nvp;
+		goto relock;
+	}
+	vrele(*svpp);
+	*svpp = nvp;
+
+	if (*tvpp != NULL)
+		vrele(*tvpp);
+	*tvpp = NULL;
+	if (tvp != NULL) {
+		nvp = tvp;
+		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+		if (error != 0) {
+			VOP_UNLOCK(sdvp, 0);
+			VOP_UNLOCK(tdvp, 0);
+			VOP_UNLOCK(*svpp, 0);
+			if (error != EBUSY) {
+				vrele(nvp);
+				goto out;
 			}
+			error = vn_lock(nvp, LK_EXCLUSIVE);
+			if (error != 0) {
+				vrele(nvp);
+				goto out;
+			}
+			vput(nvp);
+			goto relock;
 		}
+		*tvpp = nvp;
+	}
 
-		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
-		zl->zl_rwlock = rwlp;
-		zl->zl_znode = NULL;
-		zl->zl_next = *zlpp;
-		*zlpp = zl;
+	return (0);
 
-		if (oidp == szp->z_id)		/* We're a descendant of szp */
-			return (SET_ERROR(EINVAL));
+out:
+	return (error);
+}
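
Stripped down to two locks, the deadlock-avoidance scheme described at the top of zfs_rename_relock() looks like the sketch below (not the committed code): take the first lock blocking, try the second with LK_NOWAIT, and on EBUSY drop everything, wait for the contested lock once in blocking mode, release it and restart, so that we never sleep on one vnode lock while holding another.

    static int
    lock_two(struct vnode *a, struct vnode *b)
    {
        int error;

    restart:
        if ((error = vn_lock(a, LK_EXCLUSIVE)) != 0)
            return (error);
        error = vn_lock(b, LK_EXCLUSIVE | LK_NOWAIT);
        if (error == 0)
            return (0);                 /* both locks held */
        VOP_UNLOCK(a, 0);
        if (error != EBUSY)
            return (error);
        if ((error = vn_lock(b, LK_EXCLUSIVE)) != 0)    /* wait it out */
            return (error);
        VOP_UNLOCK(b, 0);               /* never hold b while retaking a */
        goto restart;
    }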
 
-		if (oidp == rootid)		/* We've hit the top */
-			return (0);
+/*
+ * Note that we must use VRELE_ASYNC in this function as it walks
+ * up the directory tree and vrele may need to acquire an exclusive
+ * lock if the last reference to a vnode is dropped.
+ */
+static int
+zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
+{
+	zfsvfs_t	*zfsvfs;
+	znode_t		*zp, *zp1;
+	uint64_t	parent;
+	int		error;
 
-		if (rw == RW_READER) {		/* i.e. not the first pass */
-			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
-			if (error)
-				return (error);
-			zl->zl_znode = zp;
+	zfsvfs = tdzp->z_zfsvfs;
+	if (tdzp == szp)
+		return (SET_ERROR(EINVAL));
+	if (tdzp == sdzp)
+		return (0);
+	if (tdzp->z_id == zfsvfs->z_root)
+		return (0);
+	zp = tdzp;
+	for (;;) {
+		ASSERT(!zp->z_unlinked);
+		if ((error = sa_lookup(zp->z_sa_hdl,
+		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+			break;
+
+		if (parent == szp->z_id) {
+			error = SET_ERROR(EINVAL);
+			break;
 		}
-		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
-		    &oidp, sizeof (oidp));
-		rwlp = &zp->z_parent_lock;
-		rw = RW_READER;
+		if (parent == zfsvfs->z_root)
+			break;
+		if (parent == sdzp->z_id)
+			break;
 
-	} while (zp->z_id != sdzp->z_id);
+		error = zfs_zget(zfsvfs, parent, &zp1);
+		if (error != 0)
+			break;
 
-	return (0);
+		if (zp != tdzp)
+			VN_RELE_ASYNC(ZTOV(zp),
+			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+		zp = zp1;
+	}
+
+	if (error == ENOTDIR)
+		panic("checkpath: .. not a directory\n");
+	if (zp != tdzp)
+		VN_RELE_ASYNC(ZTOV(zp),
+		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+	return (error);
 }
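
The check itself is the classic is-the-target-below-the-source walk: follow parent pointers from tdzp toward the root and fail with EINVAL if szp turns up on the way, which guarantees a directory can never be renamed into its own subtree. The VRELE_ASYNC() calls push the reference drops to the pool's vnrele taskq because, as the comment notes, dropping the last reference may require an exclusive vnode lock that this thread cannot safely take. In outline, with a hypothetical parent_of() standing in for the SA_ZPL_PARENT lookup and reference counting elided:

    static int
    is_descendant(uint64_t src_id, uint64_t tdir_id, uint64_t root_id)
    {
        uint64_t id;

        /* Walk ".." from the target directory up to the root. */
        for (id = tdir_id; id != root_id; id = parent_of(id)) {
            if (id == src_id)
                return (1);     /* target lies under the source */
        }
        return (0);
    }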
 
 /*
@@ -3696,181 +3724,95 @@
  */
 /*ARGSUSED*/
 static int
-zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
-    caller_context_t *ct, int flags)
+zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
+    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
+    cred_t *cr)
 {
-	znode_t		*tdzp, *szp, *tzp;
-	znode_t		*sdzp = VTOZ(sdvp);
-	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
-	zilog_t		*zilog;
-	vnode_t		*realvp;
-	zfs_dirlock_t	*sdl, *tdl;
+	zfsvfs_t	*zfsvfs;
+	znode_t		*sdzp, *tdzp, *szp, *tzp;
+	zilog_t		*zilog = NULL;
 	dmu_tx_t	*tx;
-	zfs_zlock_t	*zl;
-	int		cmp, serr, terr;
+	char		*snm = scnp->cn_nameptr;
+	char		*tnm = tcnp->cn_nameptr;
 	int		error = 0;
-	int		zflg = 0;
 
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(sdzp);
-	zilog = zfsvfs->z_log;
+	/* Reject renames across filesystems. */
+	if ((*svpp)->v_mount != tdvp->v_mount ||
+	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
+		error = SET_ERROR(EXDEV);
+		goto out;
+	}
 
+	if (zfsctl_is_node(tdvp)) {
+		error = SET_ERROR(EXDEV);
+		goto out;
+	}
+
 	/*
-	 * Make sure we have the real vp for the target directory.
+	 * Lock all four vnodes to ensure safety and semantics of renaming.
 	 */
-	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
-		tdvp = realvp;
-
-	if (tdvp->v_vfsp != sdvp->v_vfsp || zfsctl_is_node(tdvp)) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(EXDEV));
+	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
+	if (error != 0) {
+		/* no vnodes are locked in the case of error here */
+		return (error);
 	}
 
 	tdzp = VTOZ(tdvp);
-	ZFS_VERIFY_ZP(tdzp);
+	sdzp = VTOZ(sdvp);
+	zfsvfs = tdzp->z_zfsvfs;
+	zilog = zfsvfs->z_log;
+
+	/*
+	 * After we re-enter the filesystem via ZFS_ENTER() we will have
+	 * to revalidate all of the znodes involved.
+	 */
+	ZFS_ENTER(zfsvfs);
+
 	if (zfsvfs->z_utf8 && u8_validate(tnm,
 	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(EILSEQ));
+		error = SET_ERROR(EILSEQ);
+		goto unlockout;
 	}
 
-	if (flags & FIGNORECASE)
-		zflg |= ZCILOOK;
+	/* If source and target are the same file, there is nothing to do. */
+	if ((*svpp) == (*tvpp)) {
+		error = 0;
+		goto unlockout;
+	}
 
-top:
-	szp = NULL;
-	tzp = NULL;
-	zl = NULL;
+	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
+	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
+	    (*tvpp)->v_mountedhere != NULL)) {
+		error = SET_ERROR(EXDEV);
+		goto unlockout;
+	}
 
 	/*
-	 * This is to prevent the creation of links into attribute space
-	 * by renaming a linked file into/outof an attribute directory.
-	 * See the comment in zfs_link() for why this is considered bad.
+	 * We cannot use ZFS_VERIFY_ZP() here because it could return
+	 * directly, bypassing the cleanup code in the case of an error.
 	 */
-	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(EINVAL));
+	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+		error = SET_ERROR(EIO);
+		goto unlockout;
 	}
 
-	/*
-	 * Lock source and target directory entries.  To prevent deadlock,
-	 * a lock ordering must be defined.  We lock the directory with
-	 * the smallest object id first, or if it's a tie, the one with
-	 * the lexically first name.
-	 */
-	if (sdzp->z_id < tdzp->z_id) {
-		cmp = -1;
-	} else if (sdzp->z_id > tdzp->z_id) {
-		cmp = 1;
-	} else {
-		/*
-		 * First compare the two name arguments without
-		 * considering any case folding.
-		 */
-		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
-
-		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
-		ASSERT(error == 0 || !zfsvfs->z_utf8);
-		if (cmp == 0) {
-			/*
-			 * POSIX: "If the old argument and the new argument
-			 * both refer to links to the same existing file,
-			 * the rename() function shall return successfully
-			 * and perform no other action."
-			 */
-			ZFS_EXIT(zfsvfs);
-			return (0);
-		}
-		/*
-		 * If the file system is case-folding, then we may
-		 * have some more checking to do.  A case-folding file
-		 * system is either supporting mixed case sensitivity
-		 * access or is completely case-insensitive.  Note
-		 * that the file system is always case preserving.
-		 *
-		 * In mixed sensitivity mode case sensitive behavior
-		 * is the default.  FIGNORECASE must be used to
-		 * explicitly request case insensitive behavior.
-		 *
-		 * If the source and target names provided differ only
-		 * by case (e.g., a request to rename 'tim' to 'Tim'),
-		 * we will treat this as a special case in the
-		 * case-insensitive mode: as long as the source name
-		 * is an exact match, we will allow this to proceed as
-		 * a name-change request.
-		 */
-		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
-		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
-		    flags & FIGNORECASE)) &&
-		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
-		    &error) == 0) {
-			/*
-			 * case preserving rename request, require exact
-			 * name matches
-			 */
-			zflg |= ZCIEXACT;
-			zflg &= ~ZCILOOK;
-		}
+	szp = VTOZ(*svpp);
+	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
+	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
+		error = SET_ERROR(EIO);
+		goto unlockout;
 	}
 
 	/*
-	 * If the source and destination directories are the same, we should
-	 * grab the z_name_lock of that directory only once.
+	 * This is to prevent the creation of links into attribute space
+	 * by renaming a linked file into/outof an attribute directory.
+	 * See the comment in zfs_link() for why this is considered bad.
 	 */
-	if (sdzp == tdzp) {
-		zflg |= ZHAVELOCK;
-		rw_enter(&sdzp->z_name_lock, RW_READER);
+	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+		error = SET_ERROR(EINVAL);
+		goto unlockout;
 	}
 
-	if (cmp < 0) {
-		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
-		    ZEXISTS | zflg, NULL, NULL);
-		terr = zfs_dirent_lock(&tdl,
-		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
-	} else {
-		terr = zfs_dirent_lock(&tdl,
-		    tdzp, tnm, &tzp, zflg, NULL, NULL);
-		serr = zfs_dirent_lock(&sdl,
-		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
-		    NULL, NULL);
-	}
-
-	if (serr) {
-		/*
-		 * Source entry invalid or not there.
-		 */
-		if (!terr) {
-			zfs_dirent_unlock(tdl);
-			if (tzp)
-				VN_RELE(ZTOV(tzp));
-		}
-
-		if (sdzp == tdzp)
-			rw_exit(&sdzp->z_name_lock);
-
-		/*
-		 * FreeBSD: In OpenSolaris they only check if rename source is
-		 * ".." here, because "." is handled in their lookup. This is
-		 * not the case for FreeBSD, so we check for "." explicitly.
-		 */
-		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
-			serr = SET_ERROR(EINVAL);
-		ZFS_EXIT(zfsvfs);
-		return (serr);
-	}
-	if (terr) {
-		zfs_dirent_unlock(sdl);
-		VN_RELE(ZTOV(szp));
-
-		if (sdzp == tdzp)
-			rw_exit(&sdzp->z_name_lock);
-
-		if (strcmp(tnm, "..") == 0)
-			terr = SET_ERROR(EINVAL);
-		ZFS_EXIT(zfsvfs);
-		return (terr);
-	}
-
 	/*
 	 * Must have write access at the source to remove the old entry
 	 * and write access at the target to create the new entry.
@@ -3877,17 +3819,26 @@
 	 * Note that if target and source are the same, this can be
 	 * done in a single check.
 	 */
-
 	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
-		goto out;
+		goto unlockout;
 
-	if (ZTOV(szp)->v_type == VDIR) {
+	if ((*svpp)->v_type == VDIR) {
 		/*
+		 * Avoid ".", "..", and aliases of "." for obvious reasons.
+		 */
+		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
+		    sdzp == szp ||
+		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
+			error = EINVAL;
+			goto unlockout;
+		}
+
+		/*
 		 * Check to make sure rename is valid.
 		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
 		 */
-		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
-			goto out;
+		if (error = zfs_rename_check(szp, sdzp, tdzp))
+			goto unlockout;
 	}
 
 	/*
@@ -3897,31 +3848,26 @@
 		/*
 		 * Source and target must be the same type.
 		 */
-		if (ZTOV(szp)->v_type == VDIR) {
-			if (ZTOV(tzp)->v_type != VDIR) {
+		if ((*svpp)->v_type == VDIR) {
+			if ((*tvpp)->v_type != VDIR) {
 				error = SET_ERROR(ENOTDIR);
-				goto out;
+				goto unlockout;
+			} else {
+				cache_purge(tdvp);
+				if (sdvp != tdvp)
+					cache_purge(sdvp);
 			}
 		} else {
-			if (ZTOV(tzp)->v_type == VDIR) {
+			if ((*tvpp)->v_type == VDIR) {
 				error = SET_ERROR(EISDIR);
-				goto out;
+				goto unlockout;
 			}
 		}
-		/*
-		 * POSIX dictates that when the source and target
-		 * entries refer to the same file object, rename
-		 * must do nothing and exit without error.
-		 */
-		if (szp->z_id == tzp->z_id) {
-			error = 0;
-			goto out;
-		}
 	}
 
-	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
+	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
 	if (tzp)
-		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
+		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
 
 	/*
 	 * notify the target directory if it is not the same
@@ -3947,34 +3893,18 @@
 
 	zfs_sa_upgrade_txholds(tx, szp);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		if (zl != NULL)
-			zfs_rename_unlock(&zl);
-		zfs_dirent_unlock(sdl);
-		zfs_dirent_unlock(tdl);
-
-		if (sdzp == tdzp)
-			rw_exit(&sdzp->z_name_lock);
-
-		VN_RELE(ZTOV(szp));
-		if (tzp)
-			VN_RELE(ZTOV(tzp));
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
 		dmu_tx_abort(tx);
-		ZFS_EXIT(zfsvfs);
-		return (error);
+		goto unlockout;
 	}
 
+
 	if (tzp)	/* Attempt to remove the existing target */
-		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
 
 	if (error == 0) {
-		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
 		if (error == 0) {
 			szp->z_pflags |= ZFS_AV_MODIFIED;
 
@@ -3982,17 +3912,16 @@
 			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
 			ASSERT0(error);
 
-			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
+			    NULL);
 			if (error == 0) {
-				zfs_log_rename(zilog, tx, TX_RENAME |
-				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
-				    sdl->dl_name, tdzp, tdl->dl_name, szp);
+				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
+				    snm, tdzp, tnm, szp);
 
 				/*
 				 * Update path information for the target vnode
 				 */
-				vn_renamepath(tdvp, ZTOV(szp), tnm,
-				    strlen(tnm));
+				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
 			} else {
 				/*
 				 * At this point, we have successfully created
@@ -4006,39 +3935,33 @@
 				 * succeed; fortunately, it is very unlikely to
 				 * fail, since we just created it.
 				 */
-				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
+				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
 				    ZRENAMING, NULL), ==, 0);
 			}
 		}
-#ifdef FREEBSD_NAMECACHE
 		if (error == 0) {
-			cache_purge(sdvp);
-			cache_purge(tdvp);
+			cache_purge(*svpp);
+			if (*tvpp != NULL)
+				cache_purge(*tvpp);
+			cache_purge_negative(tdvp);
 		}
-#endif
 	}
 
 	dmu_tx_commit(tx);
-out:
-	if (zl != NULL)
-		zfs_rename_unlock(&zl);
 
-	zfs_dirent_unlock(sdl);
-	zfs_dirent_unlock(tdl);
+unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
+	ZFS_EXIT(zfsvfs);
+	VOP_UNLOCK(*svpp, 0);
+	VOP_UNLOCK(sdvp, 0);
 
-	if (sdzp == tdzp)
-		rw_exit(&sdzp->z_name_lock);
-
-
-	VN_RELE(ZTOV(szp));
-	if (tzp)
-		VN_RELE(ZTOV(tzp));
-
-	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+out:				/* original two vnodes are locked */
+	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
-	ZFS_EXIT(zfsvfs);
-
+	if (*tvpp != NULL)
+		VOP_UNLOCK(*tvpp, 0);
+	if (tdvp != *tvpp)
+		VOP_UNLOCK(tdvp, 0);
 	return (error);
 }
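
With this signature zfs_rename() receives the source and target vnodes by reference, since the relock step may re-resolve and replace them, and it returns with every vnode lock released, leaving only the callers' references to drop. A plausible VOP_RENAME wrapper under the usual FreeBSD calling convention might look like the following (hypothetical sketch, not part of this change):

    static int
    xxx_freebsd_rename(struct vop_rename_args *ap)
    {
        vnode_t *fvp = ap->a_fvp;
        vnode_t *tvp = ap->a_tvp;
        int error;

        /* zfs_rename() unlocks everything; the references stay ours. */
        error = zfs_rename(ap->a_fdvp, &fvp, ap->a_fcnp,
            ap->a_tdvp, &tvp, ap->a_tcnp, ap->a_fcnp->cn_cred);

        vrele(ap->a_fdvp);
        vrele(fvp);
        vrele(ap->a_tdvp);
        if (tvp != NULL)
            vrele(tvp);
        return (error);
    }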
 
@@ -4063,13 +3986,11 @@
     cred_t *cr, kthread_t *td)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
-	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	len = strlen(link);
 	int		error;
-	int		zflg = ZNEW;
 	zfs_acl_ids_t	acl_ids;
 	boolean_t	fuid_dirtied;
 	uint64_t	txtype = TX_SYMLINK;
@@ -4086,8 +4007,6 @@
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
-	if (flags & FIGNORECASE)
-		zflg |= ZCILOOK;
 
 	if (len > MAXPATHLEN) {
 		ZFS_EXIT(zfsvfs);
@@ -4099,11 +4018,11 @@
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
-top:
+
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
-	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
+	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
@@ -4112,7 +4031,6 @@
 
 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
 		zfs_acl_ids_free(&acl_ids);
-		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
@@ -4119,10 +4037,11 @@
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
-		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EDQUOT));
 	}
+
+	getnewvnode_reserve(1);
 	tx = dmu_tx_create(zfsvfs->z_os);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
@@ -4136,16 +4055,11 @@
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		zfs_dirent_unlock(dl);
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
@@ -4159,13 +4073,11 @@
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
-	mutex_enter(&zp->z_lock);
 	if (zp->z_is_sa)
 		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
 		    link, len, tx);
 	else
 		zfs_sa_symlink(zp, link, len, tx);
-	mutex_exit(&zp->z_lock);
 
 	zp->z_size = len;
 	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
@@ -4173,10 +4085,8 @@
 	/*
 	 * Insert the new object into the directory.
 	 */
-	(void) zfs_link_create(dl, zp, tx, ZNEW);
+	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
 
-	if (flags & FIGNORECASE)
-		txtype |= TX_CI;
 	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
 	*vpp = ZTOV(zp);
 
@@ -4184,7 +4094,7 @@
 
 	dmu_tx_commit(tx);
 
-	zfs_dirent_unlock(dl);
+	getnewvnode_drop_reserve();
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
@@ -4220,13 +4130,11 @@
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
-	mutex_enter(&zp->z_lock);
 	if (zp->z_is_sa)
 		error = sa_lookup_uio(zp->z_sa_hdl,
 		    SA_ZPL_SYMLINK(zfsvfs), uio);
 	else
 		error = zfs_sa_readlink(zp, uio);
-	mutex_exit(&zp->z_lock);
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 
@@ -4258,11 +4166,8 @@
 	znode_t		*tzp, *szp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
-	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
-	vnode_t		*realvp;
 	int		error;
-	int		zf = ZNEW;
 	uint64_t	parent;
 	uid_t		owner;
 
@@ -4272,9 +4177,6 @@
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
-	if (VOP_REALVP(svp, &realvp, ct) == 0)
-		svp = realvp;
-
 	/*
 	 * POSIX dictates that we return EPERM here.
 	 * Better choices include ENOTSUP or EISDIR.
@@ -4284,14 +4186,14 @@
 		return (SET_ERROR(EPERM));
 	}
 
-	if (svp->v_vfsp != tdvp->v_vfsp || zfsctl_is_node(svp)) {
+	szp = VTOZ(svp);
+	ZFS_VERIFY_ZP(szp);
+
+	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
 		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(EXDEV));
+		return (SET_ERROR(EPERM));
 	}
 
-	szp = VTOZ(svp);
-	ZFS_VERIFY_ZP(szp);
-
 	/* Prevent links to .zfs/shares files */
 
 	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
@@ -4309,8 +4211,6 @@
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
-	if (flags & FIGNORECASE)
-		zf |= ZCILOOK;
 
 	/*
 	 * We do not support links between attributes and non-attributes
@@ -4335,11 +4235,10 @@
 		return (error);
 	}
 
-top:
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
-	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
+	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
 	if (error) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
@@ -4350,32 +4249,22 @@
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	zfs_sa_upgrade_txholds(tx, szp);
 	zfs_sa_upgrade_txholds(tx, dzp);
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		zfs_dirent_unlock(dl);
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
-	error = zfs_link_create(dl, szp, tx, 0);
+	error = zfs_link_create(dzp, name, szp, tx, 0);
 
 	if (error == 0) {
 		uint64_t txtype = TX_LINK;
-		if (flags & FIGNORECASE)
-			txtype |= TX_CI;
 		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
 	}
 
 	dmu_tx_commit(tx);
 
-	zfs_dirent_unlock(dl);
-
 	if (error == 0) {
 		vnevent_link(svp, ct);
 	}
@@ -4387,243 +4276,8 @@
 	return (error);
 }
 
-#ifdef sun
-/*
- * zfs_null_putapage() is used when the file system has been force
- * unmounted. It just drops the pages.
- */
-/* ARGSUSED */
-static int
-zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
-		size_t *lenp, int flags, cred_t *cr)
-{
-	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
-	return (0);
-}
 
-/*
- * Push a page out to disk, klustering if possible.
- *
- *	IN:	vp	- file to push page to.
- *		pp	- page to push.
- *		flags	- additional flags.
- *		cr	- credentials of caller.
- *
- *	OUT:	offp	- start of range pushed.
- *		lenp	- len of range pushed.
- *
- *	RETURN:	0 on success, error code on failure.
- *
- * NOTE: callers must have locked the page to be pushed.  On
- * exit, the page (and all other pages in the kluster) must be
- * unlocked.
- */
-/* ARGSUSED */
-static int
-zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
-		size_t *lenp, int flags, cred_t *cr)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	dmu_tx_t	*tx;
-	u_offset_t	off, koff;
-	size_t		len, klen;
-	int		err;
-
-	off = pp->p_offset;
-	len = PAGESIZE;
-	/*
-	 * If our blocksize is bigger than the page size, try to kluster
-	 * multiple pages so that we write a full block (thus avoiding
-	 * a read-modify-write).
-	 */
-	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
-		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
-		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
-		ASSERT(koff <= zp->z_size);
-		if (koff + klen > zp->z_size)
-			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
-		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
-	}
-	ASSERT3U(btop(len), ==, btopr(len));
-
-	/*
-	 * Can't push pages past end-of-file.
-	 */
-	if (off >= zp->z_size) {
-		/* ignore all pages */
-		err = 0;
-		goto out;
-	} else if (off + len > zp->z_size) {
-		int npages = btopr(zp->z_size - off);
-		page_t *trunc;
-
-		page_list_break(&pp, &trunc, npages);
-		/* ignore pages past end of file */
-		if (trunc)
-			pvn_write_done(trunc, flags);
-		len = zp->z_size - off;
-	}
-
-	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
-	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
-		err = SET_ERROR(EDQUOT);
-		goto out;
-	}
-top:
-	tx = dmu_tx_create(zfsvfs->z_os);
-	dmu_tx_hold_write(tx, zp->z_id, off, len);
-
-	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
-	zfs_sa_upgrade_txholds(tx, zp);
-	err = dmu_tx_assign(tx, TXG_NOWAIT);
-	if (err != 0) {
-		if (err == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
-		dmu_tx_abort(tx);
-		goto out;
-	}
-
-	if (zp->z_blksz <= PAGESIZE) {
-		caddr_t va = zfs_map_page(pp, S_READ);
-		ASSERT3U(len, <=, PAGESIZE);
-		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
-		zfs_unmap_page(pp, va);
-	} else {
-		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
-	}
-
-	if (err == 0) {
-		uint64_t mtime[2], ctime[2];
-		sa_bulk_attr_t bulk[3];
-		int count = 0;
-
-		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
-		    &mtime, 16);
-		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
-		    &ctime, 16);
-		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
-		    &zp->z_pflags, 8);
-		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
-		    B_TRUE);
-		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
-	}
-	dmu_tx_commit(tx);
-
-out:
-	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
-	if (offp)
-		*offp = off;
-	if (lenp)
-		*lenp = len;
-
-	return (err);
-}
-
-/*
- * Copy the portion of the file indicated from pages into the file.
- * The pages are stored in a page list attached to the files vnode.
- *
- *	IN:	vp	- vnode of file to push page data to.
- *		off	- position in file to put data.
- *		len	- amount of data to write.
- *		flags	- flags to control the operation.
- *		cr	- credentials of caller.
- *		ct	- caller context.
- *
- *	RETURN:	0 on success, error code on failure.
- *
- * Timestamps:
- *	vp - ctime|mtime updated
- */
 /*ARGSUSED*/
-static int
-zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
-    caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	page_t		*pp;
-	size_t		io_len;
-	u_offset_t	io_off;
-	uint_t		blksz;
-	rl_t		*rl;
-	int		error = 0;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-
-	/*
-	 * Align this request to the file block size in case we kluster.
-	 * XXX - this can result in pretty aggresive locking, which can
-	 * impact simultanious read/write access.  One option might be
-	 * to break up long requests (len == 0) into block-by-block
-	 * operations to get narrower locking.
-	 */
-	blksz = zp->z_blksz;
-	if (ISP2(blksz))
-		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
-	else
-		io_off = 0;
-	if (len > 0 && ISP2(blksz))
-		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
-	else
-		io_len = 0;
-
-	if (io_len == 0) {
-		/*
-		 * Search the entire vp list for pages >= io_off.
-		 */
-		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
-		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
-		goto out;
-	}
-	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
-
-	if (off > zp->z_size) {
-		/* past end of file */
-		zfs_range_unlock(rl);
-		ZFS_EXIT(zfsvfs);
-		return (0);
-	}
-
-	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
-
-	for (off = io_off; io_off < off + len; io_off += io_len) {
-		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
-			pp = page_lookup(vp, io_off,
-			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
-		} else {
-			pp = page_lookup_nowait(vp, io_off,
-			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
-		}
-
-		if (pp != NULL && pvn_getdirty(pp, flags)) {
-			int err;
-
-			/*
-			 * Found a dirty page to push
-			 */
-			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
-			if (err)
-				error = err;
-		} else {
-			io_len = PAGESIZE;
-		}
-	}
-out:
-	zfs_range_unlock(rl);
-	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-		zil_commit(zfsvfs->z_log, zp->z_id);
-	ZFS_EXIT(zfsvfs);
-	return (error);
-}
-#endif	/* sun */
-
-/*ARGSUSED*/
 void
 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
 {
@@ -4638,21 +4292,18 @@
 		 * suspend/resume and this file no longer exists.
 		 */
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
-		vrecycle(vp, curthread);
+		vrecycle(vp);
 		return;
 	}
 
-	mutex_enter(&zp->z_lock);
 	if (zp->z_unlinked) {
 		/*
 		 * Fast path to recycle a vnode of a removed file.
 		 */
-		mutex_exit(&zp->z_lock);
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
-		vrecycle(vp, curthread);
+		vrecycle(vp);
 		return;
 	}
-	mutex_exit(&zp->z_lock);
 
 	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
@@ -4663,11 +4314,9 @@
 		if (error) {
 			dmu_tx_abort(tx);
 		} else {
-			mutex_enter(&zp->z_lock);
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
 			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
 			zp->z_atime_dirty = 0;
-			mutex_exit(&zp->z_lock);
 			dmu_tx_commit(tx);
 		}
 	}
@@ -4674,424 +4323,7 @@
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 }
 
-#ifdef sun
-/*
- * Bounds-check the seek operation.
- *
- *	IN:	vp	- vnode seeking within
- *		ooff	- old file offset
- *		noffp	- pointer to new file offset
- *		ct	- caller context
- *
- *	RETURN:	0 on success, EINVAL if new offset invalid.
- */
-/* ARGSUSED */
-static int
-zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
-    caller_context_t *ct)
-{
-	if (vp->v_type == VDIR)
-		return (0);
-	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
-}
 
-/*
- * Pre-filter the generic locking function to trap attempts to place
- * a mandatory lock on a memory mapped file.
- */
-static int
-zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
-    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
-{
-	znode_t *zp = VTOZ(vp);
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-
-	/*
-	 * We are following the UFS semantics with respect to mapcnt
-	 * here: If we see that the file is mapped already, then we will
-	 * return an error, but we don't worry about races between this
-	 * function and zfs_map().
-	 */
-	if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(EAGAIN));
-	}
-	ZFS_EXIT(zfsvfs);
-	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
-}
-
-/*
- * If we can't find a page in the cache, we will create a new page
- * and fill it with file data.  For efficiency, we may try to fill
- * multiple pages at once (klustering) to fill up the supplied page
- * list.  Note that the pages to be filled are held with an exclusive
- * lock to prevent access by other threads while they are being filled.
- */
-static int
-zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
-    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
-{
-	znode_t *zp = VTOZ(vp);
-	page_t *pp, *cur_pp;
-	objset_t *os = zp->z_zfsvfs->z_os;
-	u_offset_t io_off, total;
-	size_t io_len;
-	int err;
-
-	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
-		/*
-		 * We only have a single page, don't bother klustering
-		 */
-		io_off = off;
-		io_len = PAGESIZE;
-		pp = page_create_va(vp, io_off, io_len,
-		    PG_EXCL | PG_WAIT, seg, addr);
-	} else {
-		/*
-		 * Try to find enough pages to fill the page list
-		 */
-		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
-		    &io_len, off, plsz, 0);
-	}
-	if (pp == NULL) {
-		/*
-		 * The page already exists, nothing to do here.
-		 */
-		*pl = NULL;
-		return (0);
-	}
-
-	/*
-	 * Fill the pages in the kluster.
-	 */
-	cur_pp = pp;
-	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
-		caddr_t va;
-
-		ASSERT3U(io_off, ==, cur_pp->p_offset);
-		va = zfs_map_page(cur_pp, S_WRITE);
-		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
-		    DMU_READ_PREFETCH);
-		zfs_unmap_page(cur_pp, va);
-		if (err) {
-			/* On error, toss the entire kluster */
-			pvn_read_done(pp, B_ERROR);
-			/* convert checksum errors into IO errors */
-			if (err == ECKSUM)
-				err = SET_ERROR(EIO);
-			return (err);
-		}
-		cur_pp = cur_pp->p_next;
-	}
-
-	/*
-	 * Fill in the page list array from the kluster starting
-	 * from the desired offset `off'.
-	 * NOTE: the page list will always be null terminated.
-	 */
-	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
-	ASSERT(pl == NULL || (*pl)->p_offset == off);
-
-	return (0);
-}
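
The klustering described in the comment above amounts to rounding a
single-page fault out to a larger, block-aligned window so that one DMU read
can populate several pages at once.  A standalone sketch of the window
computation only; pvn_read_kluster() and P2ALIGN are replaced by plain
arithmetic, the block size is assumed to be a power of two (the callers
check ISP2), and all values are illustrative:

	#include <stdint.h>
	#include <stdio.h>

	#define PAGESIZE	4096

	/* Round a fault offset out to a block-aligned kluster window. */
	static void
	kluster(uint64_t faultoff, uint64_t blksz, uint64_t fsize,
	    uint64_t *io_off, uint64_t *io_len)
	{
		*io_off = faultoff & ~(blksz - 1);	/* P2ALIGN: block start */
		*io_len = blksz;
		if (*io_off + *io_len > fsize)		/* clamp at end of file */
			*io_len = ((fsize - *io_off + PAGESIZE - 1) /
			    PAGESIZE) * PAGESIZE;	/* btopr-style round-up */
	}

	int
	main(void)
	{
		uint64_t off, len;

		kluster(5 * 4096, 131072, 1000000, &off, &len);
		printf("fault at 20480 -> fill [%ju, %ju), %ju bytes\n",
		    (uintmax_t)off, (uintmax_t)(off + len), (uintmax_t)len);
		return (0);
	}
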
-
-/*
- * Return pointers to the pages for the file region [off, off + len]
- * in the pl array.  If plsz is greater than len, this function may
- * also return page pointers from after the specified region
- * (i.e. the region [off, off + plsz]).  These additional pages are
- * only returned if they are already in the cache, or were created as
- * part of a klustered read.
- *
- *	IN:	vp	- vnode of file to get data from.
- *		off	- position in file to get data from.
- *		len	- amount of data to retrieve.
- *		plsz	- length of provided page list.
- *		seg	- segment to obtain pages for.
- *		addr	- virtual address of fault.
- *		rw	- mode of created pages.
- *		cr	- credentials of caller.
- *		ct	- caller context.
- *
- *	OUT:	protp	- protection mode of created pages.
- *		pl	- list of pages created.
- *
- *	RETURN:	0 on success, error code on failure.
- *
- * Timestamps:
- *	vp - atime updated
- */
-/* ARGSUSED */
-static int
-zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
-    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
-    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	page_t		**pl0 = pl;
-	int		err = 0;
-
-	/* we do our own caching, faultahead is unnecessary */
-	if (pl == NULL)
-		return (0);
-	else if (len > plsz)
-		len = plsz;
-	else
-		len = P2ROUNDUP(len, PAGESIZE);
-	ASSERT(plsz >= len);
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-
-	if (protp)
-		*protp = PROT_ALL;
-
-	/*
-	 * Loop through the requested range [off, off + len) looking
-	 * for pages.  If we don't find a page, we will need to create
-	 * a new page and fill it with data from the file.
-	 */
-	while (len > 0) {
-		if (*pl = page_lookup(vp, off, SE_SHARED))
-			*(pl+1) = NULL;
-		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
-			goto out;
-		while (*pl) {
-			ASSERT3U((*pl)->p_offset, ==, off);
-			off += PAGESIZE;
-			addr += PAGESIZE;
-			if (len > 0) {
-				ASSERT3U(len, >=, PAGESIZE);
-				len -= PAGESIZE;
-			}
-			ASSERT3U(plsz, >=, PAGESIZE);
-			plsz -= PAGESIZE;
-			pl++;
-		}
-	}
-
-	/*
-	 * Fill out the page array with any pages already in the cache.
-	 */
-	while (plsz > 0 &&
-	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
-			off += PAGESIZE;
-			plsz -= PAGESIZE;
-	}
-out:
-	if (err) {
-		/*
-		 * Release any pages we have previously locked.
-		 */
-		while (pl > pl0)
-			page_unlock(*--pl);
-	} else {
-		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
-	}
-
-	*pl = NULL;
-
-	ZFS_EXIT(zfsvfs);
-	return (err);
-}
-
-/*
- * Request a memory map for a section of a file.  This code interacts
- * with common code and the VM system as follows:
- *
- * - common code calls mmap(), which ends up in smmap_common()
- * - this calls VOP_MAP(), which takes you into (say) zfs
- * - zfs_map() calls as_map(), passing segvn_create() as the callback
- * - segvn_create() creates the new segment and calls VOP_ADDMAP()
- * - zfs_addmap() updates z_mapcnt
- */
-/*ARGSUSED*/
-static int
-zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
-    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
-    caller_context_t *ct)
-{
-	znode_t *zp = VTOZ(vp);
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	segvn_crargs_t	vn_a;
-	int		error;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-
-	if ((prot & PROT_WRITE) && (zp->z_pflags &
-	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(EPERM));
-	}
-
-	if ((prot & (PROT_READ | PROT_EXEC)) &&
-	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(EACCES));
-	}
-
-	if (vp->v_flag & VNOMAP) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(ENOSYS));
-	}
-
-	if (off < 0 || len > MAXOFFSET_T - off) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(ENXIO));
-	}
-
-	if (vp->v_type != VREG) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(ENODEV));
-	}
-
-	/*
-	 * If file is locked, disallow mapping.
-	 */
-	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(EAGAIN));
-	}
-
-	as_rangelock(as);
-	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
-	if (error != 0) {
-		as_rangeunlock(as);
-		ZFS_EXIT(zfsvfs);
-		return (error);
-	}
-
-	vn_a.vp = vp;
-	vn_a.offset = (u_offset_t)off;
-	vn_a.type = flags & MAP_TYPE;
-	vn_a.prot = prot;
-	vn_a.maxprot = maxprot;
-	vn_a.cred = cr;
-	vn_a.amp = NULL;
-	vn_a.flags = flags & ~MAP_TYPE;
-	vn_a.szc = 0;
-	vn_a.lgrp_mem_policy_flags = 0;
-
-	error = as_map(as, *addrp, len, segvn_create, &vn_a);
-
-	as_rangeunlock(as);
-	ZFS_EXIT(zfsvfs);
-	return (error);
-}
-
-/* ARGSUSED */
-static int
-zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
-    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
-    caller_context_t *ct)
-{
-	uint64_t pages = btopr(len);
-
-	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
-	return (0);
-}
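
zfs_addmap() above and zfs_delmap() below form a strict pair: every mapping
adds btopr(len) pages to z_mapcnt and every teardown subtracts the same
amount, so a nonzero count means "this file is memory-mapped right now"
(this is what the mandatory-locking check in zfs_frlock() above consults).
A hedged userland sketch of the same pairing using C11 atomics; the names
are stand-ins, not the kernel API:

	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	#define PAGESIZE	4096
	/* Bytes-to-pages, rounded up: a stand-in for the kernel's btopr(). */
	#define BTOPR(x)	(((x) + PAGESIZE - 1) / PAGESIZE)

	static _Atomic uint64_t mapcnt;		/* plays the role of zp->z_mapcnt */

	static void
	addmap(size_t len)
	{
		atomic_fetch_add(&mapcnt, BTOPR(len));
	}

	static void
	delmap(size_t len)
	{
		/* The kernel asserts mapcnt >= pages before subtracting. */
		atomic_fetch_sub(&mapcnt, BTOPR(len));
	}

	int
	main(void)
	{
		addmap(10000);			/* 10000 bytes -> 3 pages */
		printf("mapped pages: %ju\n", (uintmax_t)atomic_load(&mapcnt));
		delmap(10000);
		printf("mapped pages: %ju\n", (uintmax_t)atomic_load(&mapcnt));
		return (0);
	}
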
-
-/*
- * The reason we push dirty pages as part of zfs_delmap() is so that we get a
- * more accurate mtime for the associated file.  Since we don't have a way of
- * detecting when the data was actually modified, we have to resort to
- * heuristics.  If an explicit msync() is done, then we mark the mtime when the
- * last page is pushed.  The problem occurs when the msync() call is omitted,
- * which is by far the most common case:
- *
- * 	open()
- * 	mmap()
- * 	<modify memory>
- * 	munmap()
- * 	close()
- * 	<time lapse>
- * 	putpage() via fsflush
- *
- * If we wait until fsflush to come along, we can have a modification time that
- * is some arbitrary point in the future.  In order to prevent this in the
- * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
- * torn down.
- */
-/* ARGSUSED */
-static int
-zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
-    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
-    caller_context_t *ct)
-{
-	uint64_t pages = btopr(len);
-
-	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
-	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
-
-	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
-	    vn_has_cached_data(vp))
-		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
-
-	return (0);
-}
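
The problematic sequence in the comment above is easy to reproduce from
userland.  The following POSIX program performs exactly the
open/mmap/modify/munmap/close pattern with no msync(), which is the case
the delmap-time flush defends against (the path and content are
illustrative):

	#include <fcntl.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int
	main(void)
	{
		int fd = open("/tmp/mapped.dat", O_RDWR | O_CREAT, 0644);

		if (fd < 0 || ftruncate(fd, 4096) != 0)
			return (1);
		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
		if (p == MAP_FAILED)
			return (1);
		memcpy(p, "dirty data", 10);	/* <modify memory> */
		/* No msync(p, 4096, MS_SYNC) here, deliberately. */
		munmap(p, 4096);	/* the delmap-time flush runs now, */
		close(fd);		/* so mtime reflects this point     */
		return (0);		/* rather than a later fsflush pass */
	}
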
-
-/*
- * Free or allocate space in a file.  Currently, this function only
- * supports the `F_FREESP' command.  However, this command is somewhat
- * misnamed, as its functionality includes the ability to allocate as
- * well as free space.
- *
- *	IN:	vp	- vnode of file to free data in.
- *		cmd	- action to take (only F_FREESP supported).
- *		bfp	- section of file to free/alloc.
- *		flag	- current file open mode flags.
- *		offset	- current file offset.
- *		cr	- credentials of caller [UNUSED].
- *		ct	- caller context.
- *
- *	RETURN:	0 on success, error code on failure.
- *
- * Timestamps:
- *	vp - ctime|mtime updated
- */
-/* ARGSUSED */
-static int
-zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
-    offset_t offset, cred_t *cr, caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	uint64_t	off, len;
-	int		error;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-
-	if (cmd != F_FREESP) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(EINVAL));
-	}
-
-	if (error = convoff(vp, bfp, 0, offset)) {
-		ZFS_EXIT(zfsvfs);
-		return (error);
-	}
-
-	if (bfp->l_len < 0) {
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(EINVAL));
-	}
-
-	off = bfp->l_start;
-	len = bfp->l_len; /* 0 means from off to end of file */
-
-	error = zfs_freesp(zp, off, len, flag, TRUE);
-
-	ZFS_EXIT(zfsvfs);
-	return (error);
-}
-#endif	/* sun */
-
 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
 
@@ -5167,7 +4399,6 @@
 {
 	znode_t		*zp, *xzp;
 	zfsvfs_t	*zfsvfs;
-	zfs_dirlock_t	*dl;
 	int		error;
 
 	switch (cmd) {
@@ -5178,7 +4409,7 @@
 	case _PC_FILESIZEBITS:
 		*valp = 64;
 		return (0);
-#ifdef sun
+#ifdef illumos
 	case _PC_XATTR_EXISTS:
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
@@ -5185,13 +4416,12 @@
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
 		*valp = 0;
-		error = zfs_dirent_lock(&dl, zp, "", &xzp,
-		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
+		error = zfs_dirent_lookup(zp, "", &xzp,
+		    ZXATTR | ZEXISTS | ZSHARED);
 		if (error == 0) {
-			zfs_dirent_unlock(dl);
 			if (!zfs_dirempty(xzp))
 				*valp = 1;
-			VN_RELE(ZTOV(xzp));
+			vrele(ZTOV(xzp));
 		} else if (error == ENOENT) {
 			/*
 			 * If there aren't extended attributes, it's the
@@ -5216,16 +4446,16 @@
 	case _PC_ACL_ENABLED:
 		*valp = _ACL_ACE_ENABLED;
 		return (0);
-#endif	/* sun */
+#endif	/* illumos */
 	case _PC_MIN_HOLE_SIZE:
 		*valp = (int)SPA_MINBLOCKSIZE;
 		return (0);
-#ifdef sun
+#ifdef illumos
 	case _PC_TIMESTAMP_RESOLUTION:
 		/* nanosecond timestamp resolution */
 		*valp = 1L;
 		return (0);
-#endif	/* sun */
+#endif
 	case _PC_ACL_EXTENDED:
 		*valp = 0;
 		return (0);
@@ -5262,7 +4492,7 @@
 }
 
 /*ARGSUSED*/
-static int
+int
 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
     caller_context_t *ct)
 {
@@ -5284,355 +4514,7 @@
 	return (error);
 }
 
-#ifdef sun
-/*
- * The smallest read we may consider to loan out an arcbuf.
- * This must be a power of 2.
- */
-int zcr_blksz_min = (1 << 10);	/* 1K */
-/*
- * If set to less than the file block size, allow loaning out of an
- * arcbuf for a partial block read.  This must be a power of 2.
- */
-int zcr_blksz_max = (1 << 17);	/* 128K */
-
-/*ARGSUSED*/
 static int
-zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
-    caller_context_t *ct)
-{
-	znode_t	*zp = VTOZ(vp);
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	int max_blksz = zfsvfs->z_max_blksz;
-	uio_t *uio = &xuio->xu_uio;
-	ssize_t size = uio->uio_resid;
-	offset_t offset = uio->uio_loffset;
-	int blksz;
-	int fullblk, i;
-	arc_buf_t *abuf;
-	ssize_t maxsize;
-	int preamble, postamble;
-
-	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
-		return (SET_ERROR(EINVAL));
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-	switch (ioflag) {
-	case UIO_WRITE:
-		/*
-		 * Loan out an arc_buf for write if write size is bigger than
-		 * max_blksz, and the file's block size is also max_blksz.
-		 */
-		blksz = max_blksz;
-		if (size < blksz || zp->z_blksz != blksz) {
-			ZFS_EXIT(zfsvfs);
-			return (SET_ERROR(EINVAL));
-		}
-		/*
-		 * Caller requests buffers for write before knowing where the
-		 * write offset might be (e.g. NFS TCP write).
-		 */
-		if (offset == -1) {
-			preamble = 0;
-		} else {
-			preamble = P2PHASE(offset, blksz);
-			if (preamble) {
-				preamble = blksz - preamble;
-				size -= preamble;
-			}
-		}
-
-		postamble = P2PHASE(size, blksz);
-		size -= postamble;
-
-		fullblk = size / blksz;
-		(void) dmu_xuio_init(xuio,
-		    (preamble != 0) + fullblk + (postamble != 0));
-		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
-		    int, postamble, int,
-		    (preamble != 0) + fullblk + (postamble != 0));
-
-		/*
-		 * Have to fix iov base/len for partial buffers.  They
-		 * currently represent full arc_buf's.
-		 */
-		if (preamble) {
-			/* data begins in the middle of the arc_buf */
-			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
-			    blksz);
-			ASSERT(abuf);
-			(void) dmu_xuio_add(xuio, abuf,
-			    blksz - preamble, preamble);
-		}
-
-		for (i = 0; i < fullblk; i++) {
-			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
-			    blksz);
-			ASSERT(abuf);
-			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
-		}
-
-		if (postamble) {
-			/* data ends in the middle of the arc_buf */
-			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
-			    blksz);
-			ASSERT(abuf);
-			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
-		}
-		break;
-	case UIO_READ:
-		/*
-		 * Loan out an arc_buf for read if the read size is larger than
-		 * the current file block size.  Block alignment is not
-		 * considered.  Partial arc_buf will be loaned out for read.
-		 */
-		blksz = zp->z_blksz;
-		if (blksz < zcr_blksz_min)
-			blksz = zcr_blksz_min;
-		if (blksz > zcr_blksz_max)
-			blksz = zcr_blksz_max;
-		/* avoid the complexity of loaning a buffer larger than max_blksz */
-		if (blksz > max_blksz) {
-			ZFS_EXIT(zfsvfs);
-			return (SET_ERROR(EINVAL));
-		}
-
-		maxsize = zp->z_size - uio->uio_loffset;
-		if (size > maxsize)
-			size = maxsize;
-
-		if (size < blksz || vn_has_cached_data(vp)) {
-			ZFS_EXIT(zfsvfs);
-			return (SET_ERROR(EINVAL));
-		}
-		break;
-	default:
-		ZFS_EXIT(zfsvfs);
-		return (SET_ERROR(EINVAL));
-	}
-
-	uio->uio_extflg = UIO_XUIO;
-	XUIO_XUZC_RW(xuio) = ioflag;
-	ZFS_EXIT(zfsvfs);
-	return (0);
-}
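
The preamble/postamble bookkeeping in the UIO_WRITE case above is plain
alignment arithmetic: P2PHASE(x, blksz) is x % blksz for a power-of-two
blksz, the preamble is the partial run up to the next block edge, and the
postamble is the partial tail.  A standalone worked example (the numbers
are illustrative: a 400K write starting at offset 20480 into a file with
128K blocks):

	#include <stdint.h>
	#include <stdio.h>

	/* x % align, for align a power of two. */
	#define P2PHASE(x, align)	((x) & ((align) - 1))

	int
	main(void)
	{
		uint64_t blksz = 131072;	/* 128K file block size */
		uint64_t offset = 20480;	/* write starts mid-block */
		uint64_t size = 409600;		/* 400K request */
		uint64_t preamble, postamble, fullblk;

		preamble = P2PHASE(offset, blksz);
		if (preamble != 0) {		/* bytes up to the block edge */
			preamble = blksz - preamble;
			size -= preamble;
		}
		postamble = P2PHASE(size, blksz);	/* partial tail block */
		size -= postamble;
		fullblk = size / blksz;

		/* Prints: preamble=110592 postamble=36864 full=2 -> 4 bufs */
		printf("preamble=%ju postamble=%ju full=%ju -> %ju bufs\n",
		    (uintmax_t)preamble, (uintmax_t)postamble,
		    (uintmax_t)fullblk,
		    (uintmax_t)((preamble != 0) + fullblk + (postamble != 0)));
		return (0);
	}
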
-
-/*ARGSUSED*/
-static int
-zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
-{
-	int i;
-	arc_buf_t *abuf;
-	int ioflag = XUIO_XUZC_RW(xuio);
-
-	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
-
-	i = dmu_xuio_cnt(xuio);
-	while (i-- > 0) {
-		abuf = dmu_xuio_arcbuf(xuio, i);
-		/*
-		 * if abuf == NULL, it must be a write buffer
-		 * that has been returned in zfs_write().
-		 */
-		if (abuf)
-			dmu_return_arcbuf(abuf);
-		ASSERT(abuf || ioflag == UIO_WRITE);
-	}
-
-	dmu_xuio_fini(xuio);
-	return (0);
-}
-
-/*
- * Predeclare these here so that the compiler assumes that
- * this is an "old style" function declaration that does
- * not include arguments => we won't get type mismatch errors
- * in the initializations that follow.
- */
-static int zfs_inval();
-static int zfs_isdir();
-
-static int
-zfs_inval()
-{
-	return (SET_ERROR(EINVAL));
-}
-
-static int
-zfs_isdir()
-{
-	return (SET_ERROR(EISDIR));
-}
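
The "old style" declaration trick above relies on a pre-C23 rule: a
declaration such as int f(); with empty parentheses says nothing about the
parameters, so the same error stub can initialize operation slots whose
real signatures differ, without per-slot casts or type-mismatch
diagnostics.  A hedged standalone illustration (the ops table below is
hypothetical, and the returned values merely echo common EINVAL/EISDIR
numbers):

	#include <stdio.h>

	/* No-prototype declarations: parameter lists left unspecified. */
	static int always_einval();
	static int always_eisdir();

	/* Hypothetical op table; real slots would take differing arguments. */
	struct ops {
		int (*op_read)();
		int (*op_mkdir)();
	};

	static struct ops error_ops = {
		always_einval,		/* no cast needed */
		always_eisdir,
	};

	static int always_einval() { return (22); }	/* EINVAL on many ABIs */
	static int always_eisdir() { return (21); }	/* EISDIR on many ABIs */

	int
	main(void)
	{
		printf("read -> %d, mkdir -> %d\n",
		    error_ops.op_read(), error_ops.op_mkdir());
		return (0);
	}
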
-/*
- * Directory vnode operations template
- */
-vnodeops_t *zfs_dvnodeops;
-const fs_operation_def_t zfs_dvnodeops_template[] = {
-	VOPNAME_OPEN,		{ .vop_open = zfs_open },
-	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
-	VOPNAME_READ,		{ .error = zfs_isdir },
-	VOPNAME_WRITE,		{ .error = zfs_isdir },
-	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
-	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
-	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
-	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
-	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
-	VOPNAME_CREATE,		{ .vop_create = zfs_create },
-	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
-	VOPNAME_LINK,		{ .vop_link = zfs_link },
-	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
-	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
-	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
-	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
-	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
-	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
-	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
-	VOPNAME_FID,		{ .vop_fid = zfs_fid },
-	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
-	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
-	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
-	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
-	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
-	NULL,			NULL
-};
-
-/*
- * Regular file vnode operations template
- */
-vnodeops_t *zfs_fvnodeops;
-const fs_operation_def_t zfs_fvnodeops_template[] = {
-	VOPNAME_OPEN,		{ .vop_open = zfs_open },
-	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
-	VOPNAME_READ,		{ .vop_read = zfs_read },
-	VOPNAME_WRITE,		{ .vop_write = zfs_write },
-	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
-	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
-	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
-	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
-	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
-	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
-	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
-	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
-	VOPNAME_FID,		{ .vop_fid = zfs_fid },
-	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
-	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
-	VOPNAME_SPACE,		{ .vop_space = zfs_space },
-	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
-	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
-	VOPNAME_MAP,		{ .vop_map = zfs_map },
-	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
-	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
-	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
-	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
-	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
-	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
-	VOPNAME_REQZCBUF, 	{ .vop_reqzcbuf = zfs_reqzcbuf },
-	VOPNAME_RETZCBUF, 	{ .vop_retzcbuf = zfs_retzcbuf },
-	NULL,			NULL
-};
-
-/*
- * Symbolic link vnode operations template
- */
-vnodeops_t *zfs_symvnodeops;
-const fs_operation_def_t zfs_symvnodeops_template[] = {
-	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
-	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
-	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
-	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
-	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
-	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
-	VOPNAME_FID,		{ .vop_fid = zfs_fid },
-	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
-	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
-	NULL,			NULL
-};
-
-/*
- * special share hidden files vnode operations template
- */
-vnodeops_t *zfs_sharevnodeops;
-const fs_operation_def_t zfs_sharevnodeops_template[] = {
-	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
-	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
-	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
-	VOPNAME_FID,		{ .vop_fid = zfs_fid },
-	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
-	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
-	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
-	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
-	NULL,			NULL
-};
-
-/*
- * Extended attribute directory vnode operations template
- *
- * This template is identical to the directory vnodes
- * operation template except for restricted operations:
- *	VOP_MKDIR()
- *	VOP_SYMLINK()
- *
- * Note that there are other restrictions embedded in:
- *	zfs_create()	- restrict type to VREG
- *	zfs_link()	- no links into/out of attribute space
- *	zfs_rename()	- no moves into/out of attribute space
- */
-vnodeops_t *zfs_xdvnodeops;
-const fs_operation_def_t zfs_xdvnodeops_template[] = {
-	VOPNAME_OPEN,		{ .vop_open = zfs_open },
-	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
-	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
-	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
-	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
-	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
-	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
-	VOPNAME_CREATE,		{ .vop_create = zfs_create },
-	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
-	VOPNAME_LINK,		{ .vop_link = zfs_link },
-	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
-	VOPNAME_MKDIR,		{ .error = zfs_inval },
-	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
-	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
-	VOPNAME_SYMLINK,	{ .error = zfs_inval },
-	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
-	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
-	VOPNAME_FID,		{ .vop_fid = zfs_fid },
-	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
-	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
-	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
-	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
-	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
-	NULL,			NULL
-};
-
-/*
- * Error vnode operations template
- */
-vnodeops_t *zfs_evnodeops;
-const fs_operation_def_t zfs_evnodeops_template[] = {
-	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
-	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
-	NULL,			NULL
-};
-#endif	/* sun */
-
-static int
-ioflags(int ioflags)
-{
-	int flags = 0;
-
-	if (ioflags & IO_APPEND)
-		flags |= FAPPEND;
-	if (ioflags & IO_NDELAY)
-        	flags |= FNONBLOCK;
-	if (ioflags & IO_SYNC)
-		flags |= (FSYNC | FDSYNC | FRSYNC);
-
-	return (flags);
-}
-
-static int
 zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
 {
 	znode_t *zp = VTOZ(vp);
@@ -5679,7 +4561,7 @@
 	mfirst = m[reqstart];
 	mlast = m[reqstart + reqsize - 1];
 
-	VM_OBJECT_LOCK(object);
+	zfs_vmobject_wlock(object);
 
 	for (i = 0; i < reqstart; i++) {
 		vm_page_lock(m[i]);
@@ -5695,9 +4577,9 @@
 	if (mreq->valid && reqsize == 1) {
 		if (mreq->valid != VM_PAGE_BITS_ALL)
 			vm_page_zero_invalid(mreq, TRUE);
-		VM_OBJECT_UNLOCK(object);
+		zfs_vmobject_wunlock(object);
 		ZFS_EXIT(zfsvfs);
-		return (VM_PAGER_OK);
+		return (zfs_vm_pagerret_ok);
 	}
 
 	PCPU_INC(cnt.v_vnodein);
@@ -5711,9 +4593,9 @@
 				vm_page_unlock(m[i]);
 			}
 		}
-		VM_OBJECT_UNLOCK(object);
+		zfs_vmobject_wunlock(object);
 		ZFS_EXIT(zfsvfs);
-		return (VM_PAGER_BAD);
+		return (zfs_vm_pagerret_bad);
 	}
 
 	lsize = PAGE_SIZE;
@@ -5720,7 +4602,7 @@
 	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
 		lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex);
 
-	VM_OBJECT_UNLOCK(object);
+	zfs_vmobject_wunlock(object);
 
 	for (i = reqstart; i < reqstart + reqsize; i++) {
 		size = PAGE_SIZE;
@@ -5736,7 +4618,7 @@
 			break;
 	}
 
-	VM_OBJECT_LOCK(object);
+	zfs_vmobject_wlock(object);
 
 	for (i = reqstart; i < reqstart + reqsize; i++) {
 		if (!error)
@@ -5746,11 +4628,11 @@
 			vm_page_readahead_finish(m[i]);
 	}
 
-	VM_OBJECT_UNLOCK(object);
+	zfs_vmobject_wunlock(object);
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	ZFS_EXIT(zfsvfs);
-	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
+	return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok);
 }
 
 static int
@@ -5768,6 +4650,162 @@
 }
 
 static int
+zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
+    int *rtvals)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	rl_t		*rl;
+	dmu_tx_t	*tx;
+	struct sf_buf	*sf;
+	vm_object_t	object;
+	vm_page_t	m;
+	caddr_t		va;
+	size_t		tocopy;
+	size_t		lo_len;
+	vm_ooffset_t	lo_off;
+	vm_ooffset_t	off;
+	uint_t		blksz;
+	int		ncount;
+	int		pcount;
+	int		err;
+	int		i;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	object = vp->v_object;
+	pcount = btoc(len);
+	ncount = pcount;
+
+	KASSERT(ma[0]->object == object, ("mismatching object"));
+	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
+
+	for (i = 0; i < pcount; i++)
+		rtvals[i] = zfs_vm_pagerret_error;
+
+	off = IDX_TO_OFF(ma[0]->pindex);
+	blksz = zp->z_blksz;
+	lo_off = rounddown(off, blksz);
+	lo_len = roundup(len + (off - lo_off), blksz);
+	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
+
+	zfs_vmobject_wlock(object);
+	if (len + off > object->un_pager.vnp.vnp_size) {
+		if (object->un_pager.vnp.vnp_size > off) {
+			int pgoff;
+
+			len = object->un_pager.vnp.vnp_size - off;
+			ncount = btoc(len);
+			if ((pgoff = (int)len & PAGE_MASK) != 0) {
+				/*
+				 * If the object is locked and the following
+				 * conditions hold, then the page's dirty
+				 * field cannot be concurrently changed by a
+				 * pmap operation.
+				 */
+				m = ma[ncount - 1];
+				vm_page_assert_sbusied(m);
+				KASSERT(!pmap_page_is_write_mapped(m),
+				    ("zfs_putpages: page %p is not read-only", m));
+				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
+				    pgoff);
+			}
+		} else {
+			len = 0;
+			ncount = 0;
+		}
+		if (ncount < pcount) {
+			for (i = ncount; i < pcount; i++) {
+				rtvals[i] = zfs_vm_pagerret_bad;
+			}
+		}
+	}
+	zfs_vmobject_wunlock(object);
+
+	if (ncount == 0)
+		goto out;
+
+	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
+	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
+		goto out;
+	}
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_write(tx, zp->z_id, off, len);
+
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		goto out;
+	}
+
+	if (zp->z_blksz < PAGE_SIZE) {
+		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
+			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
+			va = zfs_map_page(ma[i], &sf);
+			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
+			zfs_unmap_page(sf);
+		}
+	} else {
+		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
+	}
+
+	if (err == 0) {
+		uint64_t mtime[2], ctime[2];
+		sa_bulk_attr_t bulk[3];
+		int count = 0;
+
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+		    &mtime, 16);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+		    &ctime, 16);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+		    &zp->z_pflags, 8);
+		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+		    B_TRUE);
+		(void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
+
+		zfs_vmobject_wlock(object);
+		for (i = 0; i < ncount; i++) {
+			rtvals[i] = zfs_vm_pagerret_ok;
+			vm_page_undirty(ma[i]);
+		}
+		zfs_vmobject_wunlock(object);
+		PCPU_INC(cnt.v_vnodeout);
+		PCPU_ADD(cnt.v_vnodepgsout, ncount);
+	}
+	dmu_tx_commit(tx);
+
+out:
+	zfs_range_unlock(rl);
+	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
+	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zfsvfs->z_log, zp->z_id);
+	ZFS_EXIT(zfsvfs);
+	return (rtvals[0]);
+}
+
+int
+zfs_freebsd_putpages(ap)
+	struct vop_putpages_args /* {
+		struct vnode *a_vp;
+		vm_page_t *a_m;
+		int a_count;
+		int a_sync;
+		int *a_rtvals;
+		vm_ooffset_t a_offset;
+	} */ *ap;
+{
+
+	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
+	    ap->a_rtvals));
+}
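
Note how zfs_putpages() takes its range lock on lo_off/lo_len, a region
rounded out to block boundaries, rather than on the byte span the pager
handed in: ZFS must own every block it is about to dirty.  A standalone
check of that rounding (rounddown()/roundup() get minimal local
definitions, and the values are illustrative):

	#include <stdint.h>
	#include <stdio.h>

	/* Minimal equivalents of the kernel's rounddown()/roundup(). */
	#define ROUNDDOWN(x, y)	(((x) / (y)) * (y))
	#define ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))

	int
	main(void)
	{
		uint64_t off = 81920;		/* IDX_TO_OFF(ma[0]->pindex) */
		uint64_t len = 16384;		/* four 4K pages */
		uint64_t blksz = 131072;	/* zp->z_blksz */

		uint64_t lo_off = ROUNDDOWN(off, blksz);
		uint64_t lo_len = ROUNDUP(len + (off - lo_off), blksz);

		/* Prints: lock [0, 131072): the whole 128K block. */
		printf("lock [%ju, %ju)\n",
		    (uintmax_t)lo_off, (uintmax_t)(lo_off + lo_len));
		return (0);
	}
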
+
+static int
 zfs_freebsd_bmap(ap)
 	struct vop_bmap_args /* {
 		struct vnode *a_vp;
@@ -5840,6 +4878,21 @@
 }
 
 static int
+ioflags(int ioflags)
+{
+	int flags = 0;
+
+	if (ioflags & IO_APPEND)
+		flags |= FAPPEND;
+	if (ioflags & IO_NDELAY)
+		flags |= FNONBLOCK;
+	if (ioflags & IO_SYNC)
+		flags |= (FSYNC | FDSYNC | FRSYNC);
+
+	return (flags);
+}
+
+static int
 zfs_freebsd_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
@@ -5930,6 +4983,23 @@
 }
 
 static int
+zfs_cache_lookup(ap)
+	struct vop_lookup_args /* {
+		struct vnode *a_dvp;
+		struct vnode **a_vpp;
+		struct componentname *a_cnp;
+	} */ *ap;
+{
+	zfsvfs_t *zfsvfs;
+
+	zfsvfs = ap->a_dvp->v_mount->mnt_data;
+	if (zfsvfs->z_use_namecache)
+		return (vfs_cache_lookup(ap));
+	else
+		return (zfs_freebsd_lookup(ap));
+}
+
+static int
 zfs_freebsd_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
@@ -5938,17 +5008,23 @@
 		struct vattr *a_vap;
 	} */ *ap;
 {
+	zfsvfs_t *zfsvfs;
 	struct componentname *cnp = ap->a_cnp;
 	vattr_t *vap = ap->a_vap;
-	int mode;
+	int error, mode;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	vattr_init_mask(vap);
 	mode = vap->va_mode & ALLPERMS;
+	zfsvfs = ap->a_dvp->v_mount->mnt_data;
 
-	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
-	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
+	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
+	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
+	if (zfsvfs->z_use_namecache &&
+	    error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
+		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
+	return (error);
 }
 
 static int
@@ -5962,8 +5038,8 @@
 
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 
-	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
-	    ap->a_cnp->cn_cred, NULL, 0));
+	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+	    ap->a_cnp->cn_cred));
 }
 
 static int
@@ -5982,7 +5058,7 @@
 	vattr_init_mask(vap);
 
 	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
-	    ap->a_cnp->cn_cred, NULL, 0, NULL));
+	    ap->a_cnp->cn_cred));
 }
 
 static int
@@ -5997,7 +5073,7 @@
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
-	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
+	return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
 }
 
 static int
@@ -6052,6 +5128,14 @@
 	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
 	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
 	XVA_SET_REQ(&xvap, XAT_NODUMP);
+	XVA_SET_REQ(&xvap, XAT_READONLY);
+	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
+	XVA_SET_REQ(&xvap, XAT_SYSTEM);
+	XVA_SET_REQ(&xvap, XAT_HIDDEN);
+	XVA_SET_REQ(&xvap, XAT_REPARSE);
+	XVA_SET_REQ(&xvap, XAT_OFFLINE);
+	XVA_SET_REQ(&xvap, XAT_SPARSE);
+
 	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
 	if (error != 0)
 		return (error);
@@ -6067,8 +5151,23 @@
 	    xvap.xva_xoptattrs.xoa_appendonly);
 	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
 	    xvap.xva_xoptattrs.xoa_nounlink);
+	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
+	    xvap.xva_xoptattrs.xoa_archive);
 	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
 	    xvap.xva_xoptattrs.xoa_nodump);
+	FLAG_CHECK(UF_READONLY, XAT_READONLY,
+	    xvap.xva_xoptattrs.xoa_readonly);
+	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
+	    xvap.xva_xoptattrs.xoa_system);
+	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
+	    xvap.xva_xoptattrs.xoa_hidden);
+	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
+	    xvap.xva_xoptattrs.xoa_reparse);
+	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
+	    xvap.xva_xoptattrs.xoa_offline);
+	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
+	    xvap.xva_xoptattrs.xoa_sparse);
+
 #undef	FLAG_CHECK
 	*vap = xvap.xva_vattr;
 	vap->va_flags = fflags;
@@ -6106,7 +5205,16 @@
 			return (EOPNOTSUPP);
 
 		fflags = vap->va_flags;
-		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
+		/*
+		 * XXX KDM 
+		 * We need to figure out whether it makes sense to allow
+		 * UF_REPARSE through, since we don't really have other
+		 * facilities to handle reparse points and zfs_setattr()
+		 * doesn't currently allow setting that attribute anyway.
+		 */
+		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
+		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
+		     UF_OFFLINE|UF_SPARSE)) != 0)
 			return (EOPNOTSUPP);
 		/*
 		 * Unprivileged processes are not permitted to unset system
@@ -6158,8 +5266,22 @@
 		    xvap.xva_xoptattrs.xoa_appendonly);
 		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
 		    xvap.xva_xoptattrs.xoa_nounlink);
+		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
+		    xvap.xva_xoptattrs.xoa_archive);
 		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
 		    xvap.xva_xoptattrs.xoa_nodump);
+		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
+		    xvap.xva_xoptattrs.xoa_readonly);
+		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
+		    xvap.xva_xoptattrs.xoa_system);
+		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
+		    xvap.xva_xoptattrs.xoa_hidden);
+		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
+		    xvap.xva_xoptattrs.xoa_reparse);
+		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
+		    xvap.xva_xoptattrs.xoa_offline);
+		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
+		    xvap.xva_xoptattrs.xoa_sparse);
 #undef	FLAG_CHANGE
 	}
 	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
@@ -6185,17 +5307,14 @@
 	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
 	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
 
-	error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
-	    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
+	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+	    ap->a_tcnp, ap->a_fcnp->cn_cred);
 
-	if (tdvp == tvp)
-		VN_RELE(tdvp);
-	else
-		VN_URELE(tdvp);
-	if (tvp)
-		VN_URELE(tvp);
-	VN_RELE(fdvp);
-	VN_RELE(fvp);
+	vrele(fdvp);
+	vrele(fvp);
+	vrele(tdvp);
+	if (tvp != NULL)
+		vrele(tvp);
 
 	return (error);
 }
@@ -6243,10 +5362,15 @@
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
+	vnode_t *vp = ap->a_vp;
+	vnode_t *tdvp = ap->a_tdvp;
 
+	if (tdvp->v_mount != vp->v_mount)
+		return (EXDEV);
+
 	ASSERT(cnp->cn_flags & SAVENAME);
 
-	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
+	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
 }
 
 static int
@@ -6443,7 +5567,7 @@
 	}
 
 	flags = FREAD;
-	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
+	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
 	    xvp, td);
 	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
 	vp = nd.ni_vp;
@@ -6511,18 +5635,20 @@
 		return (error);
 	}
 
-	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE,
+	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
 	    UIO_SYSSPACE, attrname, xvp, td);
 	error = namei(&nd);
 	vp = nd.ni_vp;
-	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
+		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (error == ENOENT)
 			error = ENOATTR;
 		return (error);
 	}
+
 	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
+	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	vput(nd.ni_dvp);
 	if (vp == nd.ni_dvp)
@@ -6578,7 +5704,7 @@
 	}
 
 	flags = FFLAGS(O_WRONLY | O_CREAT);
-	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
+	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
 	    xvp, td);
 	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
 	vp = nd.ni_vp;
@@ -6592,7 +5718,7 @@
 	va.va_size = 0;
 	error = VOP_SETATTR(vp, &va, ap->a_cred);
 	if (error == 0)
-		VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
+		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
 
 	VOP_UNLOCK(vp, 0);
 	vn_close(vp, flags, ap->a_cred, td);
@@ -6659,7 +5785,7 @@
 		return (error);
 	}
 
-	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE,
+	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
 	    UIO_SYSSPACE, ".", xvp, td);
 	error = namei(&nd);
 	vp = nd.ni_vp;
@@ -6769,6 +5895,9 @@
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EINVAL);
 
+	if (ap->a_aclp == NULL)
+		return (EINVAL);
+
 	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
 		return (EINVAL);
 
@@ -6812,6 +5941,87 @@
 	return (EOPNOTSUPP);
 }
 
+static int
+zfs_vptocnp(struct vop_vptocnp_args *ap)
+{
+	vnode_t *covered_vp;
+	vnode_t *vp = ap->a_vp;
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	znode_t *zp = VTOZ(vp);
+	int ltype;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	/*
+	 * If we are a snapshot mounted under .zfs, run the operation
+	 * on the covered vnode.
+	 */
+	if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
+		char name[MAXNAMLEN + 1];
+		znode_t *dzp;
+		size_t len;
+
+		error = zfs_znode_parent_and_name(zp, &dzp, name);
+		if (error == 0) {
+			len = strlen(name);
+			if (*ap->a_buflen < len)
+				error = SET_ERROR(ENOMEM);
+		}
+		if (error == 0) {
+			*ap->a_buflen -= len;
+			bcopy(name, ap->a_buf + *ap->a_buflen, len);
+			*ap->a_vpp = ZTOV(dzp);
+		}
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	ZFS_EXIT(zfsvfs);
+
+	covered_vp = vp->v_mount->mnt_vnodecovered;
+	vhold(covered_vp);
+	ltype = VOP_ISLOCKED(vp);
+	VOP_UNLOCK(vp, 0);
+	error = vget(covered_vp, LK_SHARED, curthread);
+	vdrop(covered_vp);
+	if (error == 0) {
+		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
+		    ap->a_buf, ap->a_buflen);
+		vput(covered_vp);
+	}
+	vn_lock(vp, ltype | LK_RETRY);
+	if ((vp->v_iflag & VI_DOOMED) != 0)
+		error = SET_ERROR(ENOENT);
+	return (error);
+}
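
zfs_vptocnp() composes names right to left: each level copies its component
to the end of the caller's buffer and shrinks *a_buflen, so by the time the
walk reaches the root the assembled path sits contiguous at the buffer's
tail.  A standalone sketch of that back-to-front assembly (the component
list is illustrative):

	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		/* Components discovered leaf-to-root, as the walk would. */
		const char *comp[] = { "file.txt", "dir", "home" };
		char buf[64];
		size_t buflen = sizeof(buf);	/* counts down, like *a_buflen */

		for (int i = 0; i < 3; i++) {
			size_t len = strlen(comp[i]);

			if (buflen < len + 1)	/* +1 for the separator */
				return (1);	/* the ENOMEM case above */
			buflen -= len;
			memcpy(buf + buflen, comp[i], len);
			buf[--buflen] = '/';
		}
		/* Prints: /home/dir/file.txt */
		printf("%.*s\n", (int)(sizeof(buf) - buflen), buf + buflen);
		return (0);
	}
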
+
+#ifdef DIAGNOSTIC
+static int
+zfs_lock(ap)
+	struct vop_lock1_args /* {
+		struct vnode *a_vp;
+		int a_flags;
+		char *file;
+		int line;
+	} */ *ap;
+{
+	vnode_t *vp;
+	znode_t *zp;
+	int err;
+
+	err = vop_stdlock(ap);
+	if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) {
+		vp = ap->a_vp;
+		zp = vp->v_data;
+		if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 &&
+		    zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
+			VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
+	}
+	return (err);
+}
+#endif
+
 struct vop_vector zfs_vnodeops;
 struct vop_vector zfs_fifoops;
 struct vop_vector zfs_shareops;
@@ -6821,12 +6031,8 @@
 	.vop_inactive =		zfs_freebsd_inactive,
 	.vop_reclaim =		zfs_freebsd_reclaim,
 	.vop_access =		zfs_freebsd_access,
-#ifdef FREEBSD_NAMECACHE
-	.vop_lookup =		vfs_cache_lookup,
+	.vop_lookup =		zfs_cache_lookup,
 	.vop_cachedlookup =	zfs_freebsd_lookup,
-#else
-	.vop_lookup =		zfs_freebsd_lookup,
-#endif
 	.vop_getattr =		zfs_freebsd_getattr,
 	.vop_setattr =		zfs_freebsd_setattr,
 	.vop_create =		zfs_freebsd_create,
@@ -6856,6 +6062,11 @@
 	.vop_setacl =		zfs_freebsd_setacl,
 	.vop_aclcheck =		zfs_freebsd_aclcheck,
 	.vop_getpages =		zfs_freebsd_getpages,
+	.vop_putpages =		zfs_freebsd_putpages,
+	.vop_vptocnp =		zfs_vptocnp,
+#ifdef DIAGNOSTIC
+	.vop_lock1 =		zfs_lock,
+#endif
 };
 
 struct vop_vector zfs_fifoops = {

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
@@ -55,6 +56,7 @@
 #endif /* _KERNEL */
 
 #include <sys/dmu.h>
+#include <sys/dmu_objset.h>
 #include <sys/refcount.h>
 #include <sys/stat.h>
 #include <sys/zap.h>
@@ -68,8 +70,8 @@
 #include "zfs_comutil.h"
 
 /* Used by fstat(1). */
-SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
-    "sizeof(znode_t)");
+SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD,
+    SYSCTL_NULL_INT_PTR, sizeof(znode_t), "sizeof(znode_t)");
 
 /*
  * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
@@ -114,43 +116,15 @@
 extern struct vop_vector zfs_fifoops;
 extern struct vop_vector zfs_shareops;
 
-/*
- * XXX: We cannot use this function as a cache constructor, because
- *      there is one global cache for all file systems and we need
- *      to pass vfsp here, which is not possible, because argument
- *      'cdrarg' is defined at kmem_cache_create() time.
- */
-/*ARGSUSED*/
 static int
 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 {
 	znode_t *zp = buf;
-	vnode_t *vp;
-	vfs_t *vfsp = arg;
-	int error;
 
 	POINTER_INVALIDATE(&zp->z_zfsvfs);
-	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
 
-	if (vfsp != NULL) {
-		error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp);
-		if (error != 0 && (kmflags & KM_NOSLEEP))
-			return (-1);
-		ASSERT(error == 0);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-		zp->z_vnode = vp;
-		vp->v_data = (caddr_t)zp;
-		VN_LOCK_AREC(vp);
-		VN_LOCK_ASHARE(vp);
-	} else {
-		zp->z_vnode = NULL;
-	}
-
 	list_link_init(&zp->z_link_node);
 
-	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
-	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
-	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -157,8 +131,8 @@
 	avl_create(&zp->z_range_avl, zfs_range_compare,
 	    sizeof (rl_t), offsetof(rl_t, r_node));
 
-	zp->z_dirlocks = NULL;
 	zp->z_acl_cached = NULL;
+	zp->z_vnode = NULL;
 	zp->z_moved = 0;
 	return (0);
 }
@@ -170,17 +144,12 @@
 	znode_t *zp = buf;
 
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
-	ASSERT(ZTOV(zp) == NULL);
-	vn_free(ZTOV(zp));
+	ASSERT3P(zp->z_vnode, ==, NULL);
 	ASSERT(!list_link_active(&zp->z_link_node));
-	mutex_destroy(&zp->z_lock);
-	rw_destroy(&zp->z_parent_lock);
-	rw_destroy(&zp->z_name_lock);
 	mutex_destroy(&zp->z_acl_lock);
 	avl_destroy(&zp->z_range_avl);
 	mutex_destroy(&zp->z_range_lock);
 
-	ASSERT(zp->z_dirlocks == NULL);
 	ASSERT(zp->z_acl_cached == NULL);
 }
 
@@ -196,7 +165,7 @@
 } znode_move_stats;
 #endif	/* ZNODE_STATS */
 
-#ifdef sun
+#ifdef illumos
 static void
 zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
 {
@@ -301,7 +270,7 @@
 	 * can safely ensure that the filesystem is not and will not be
 	 * unmounted. The next statement is equivalent to ZFS_ENTER().
 	 */
-	rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
+	rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
 	if (zfsvfs->z_unmounted) {
 		ZFS_EXIT(zfsvfs);
 		rw_exit(&zfsvfs_lock);
@@ -367,7 +336,7 @@
 
 	return (KMEM_CBRC_YES);
 }
-#endif /* sun */
+#endif /* illumos */
 
 void
 zfs_znode_init(void)
@@ -378,7 +347,7 @@
 	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
 	ASSERT(znode_cache == NULL);
 	znode_cache = kmem_cache_create("zfs_znode_cache",
-	    sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
+	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
 	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
 	kmem_cache_set_move(znode_cache, zfs_znode_move);
 }
@@ -386,12 +355,12 @@
 void
 zfs_znode_fini(void)
 {
-#ifdef sun
+#ifdef illumos
 	/*
 	 * Cleanup vfs & vnode ops
 	 */
 	zfs_remove_op_tables();
-#endif	/* sun */
+#endif
 
 	/*
 	 * Cleanup zcache
@@ -402,7 +371,7 @@
 	rw_destroy(&zfsvfs_lock);
 }
 
-#ifdef sun
+#ifdef illumos
 struct vnodeops *zfs_dvnodeops;
 struct vnodeops *zfs_fvnodeops;
 struct vnodeops *zfs_symvnodeops;
@@ -494,7 +463,7 @@
 
 	return (error);
 }
-#endif	/* sun */
+#endif	/* illumos */
 
 int
 zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
@@ -502,7 +471,6 @@
 	zfs_acl_ids_t acl_ids;
 	vattr_t vattr;
 	znode_t *sharezp;
-	vnode_t *vp, vnode;
 	znode_t *zp;
 	int error;
 
@@ -513,7 +481,6 @@
 	vattr.va_gid = crgetgid(kcred);
 
 	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
-	zfs_znode_cache_constructor(sharezp, zfsvfs->z_parent->z_vfs, 0);
 	ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
 	sharezp->z_moved = 0;
 	sharezp->z_unlinked = 0;
@@ -521,12 +488,6 @@
 	sharezp->z_zfsvfs = zfsvfs;
 	sharezp->z_is_sa = zfsvfs->z_use_sa;
 
-	sharezp->z_vnode = &vnode;
-	vnode.v_data = sharezp;
-
-	vp = ZTOV(sharezp);
-	vp->v_type = VDIR;
-
 	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
 	    kcred, NULL, &acl_ids));
 	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
@@ -537,12 +498,7 @@
 	zfsvfs->z_shares_dir = sharezp->z_id;
 
 	zfs_acl_ids_free(&acl_ids);
-	ZTOV(sharezp)->v_data = NULL;
-	ZTOV(sharezp)->v_count = 0;
-	ZTOV(sharezp)->v_holdcnt = 0;
-	zp->z_vnode = NULL;
 	sa_handle_destroy(sharezp->z_sa_hdl);
-	sharezp->z_vnode = NULL;
 	kmem_cache_free(znode_cache, sharezp);
 
 	return (error);
@@ -595,8 +551,6 @@
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
 
-	mutex_enter(&zp->z_lock);
-
 	ASSERT(zp->z_sa_hdl == NULL);
 	ASSERT(zp->z_acl_cached == NULL);
 	if (sa_hdl == NULL) {
@@ -610,12 +564,12 @@
 	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
 
 	/*
-	 * Slap on VROOT if we are the root znode
+	 * Slap on VROOT if we are the root znode unless we are the root
+	 * node of a snapshot mounted under .zfs.
 	 */
-	if (zp->z_id == zfsvfs->z_root)
+	if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs)
 		ZTOV(zp)->v_flag |= VROOT;
 
-	mutex_exit(&zp->z_lock);
 	vn_exists(ZTOV(zp));
 }
 
@@ -658,11 +612,20 @@
 	uint64_t parent;
 	sa_bulk_attr_t bulk[9];
 	int count = 0;
+	int error;
 
 	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
-	zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0);
 
-	ASSERT(zp->z_dirlocks == NULL);
+	KASSERT(curthread->td_vp_reserv > 0,
+	    ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
+	error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp);
+	if (error != 0) {
+		kmem_cache_free(znode_cache, zp);
+		return (NULL);
+	}
+	zp->z_vnode = vp;
+	vp->v_data = zp;
+
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
 	zp->z_moved = 0;
 
@@ -716,7 +679,7 @@
 	case VDIR:
 		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
 		break;
-#ifdef sun
+#ifdef illumos
 	case VBLK:
 	case VCHR:
 		{
@@ -727,12 +690,12 @@
 			vp->v_rdev = zfs_cmpldev(rdev);
 		}
 		break;
-#endif	/* sun */
+#endif
 	case VFIFO:
-#ifdef sun
+#ifdef illumos
 	case VSOCK:
 	case VDOOR:
-#endif	/* sun */
+#endif
 		vp->v_op = &zfs_fifoops;
 		break;
 	case VREG:
@@ -741,7 +704,7 @@
 			vp->v_op = &zfs_shareops;
 		}
 		break;
-#ifdef sun
+#ifdef illumos
 	case VLNK:
 		vn_setops(vp, zfs_symvnodeops);
 		break;
@@ -748,10 +711,8 @@
 	default:
 		vn_setops(vp, zfs_evnodeops);
 		break;
-#endif	/* sun */
+#endif
 	}
-	if (vp->v_type != VFIFO)
-		VN_LOCK_ASHARE(vp);
 
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
@@ -763,7 +724,17 @@
 	zp->z_zfsvfs = zfsvfs;
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
+	/*
+	 * Acquire vnode lock before making it available to the world.
+	 */
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	VN_LOCK_AREC(vp);
+	if (vp->v_type != VFIFO)
+		VN_LOCK_ASHARE(vp);
+
+#ifdef illumos
 	VFS_HOLD(zfsvfs->z_vfs);
+#endif
 	return (zp);
 }
 
@@ -815,7 +786,7 @@
 		gen = vap->va_nblocks;		/* ditto */
 	} else {
 		obj = 0;
-		gethrestime(&now);
+		vfs_timestamp(&now);
 		gen = dmu_tx_get_txg(tx);
 	}
 
@@ -834,10 +805,9 @@
 	 */
 	if (vap->va_type == VDIR) {
 		if (zfsvfs->z_replay) {
-			err = zap_create_claim_norm(zfsvfs->z_os, obj,
+			VERIFY0(zap_create_claim_norm(zfsvfs->z_os, obj,
 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
-			    obj_type, bonuslen, tx);
-			ASSERT0(err);
+			    obj_type, bonuslen, tx));
 		} else {
 			obj = zap_create_norm(zfsvfs->z_os,
 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
@@ -845,10 +815,9 @@
 		}
 	} else {
 		if (zfsvfs->z_replay) {
-			err = dmu_object_claim(zfsvfs->z_os, obj,
+			VERIFY0(dmu_object_claim(zfsvfs->z_os, obj,
 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
-			    obj_type, bonuslen, tx);
-			ASSERT0(err);
+			    obj_type, bonuslen, tx));
 		} else {
 			obj = dmu_object_alloc(zfsvfs->z_os,
 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
@@ -856,7 +825,6 @@
 		}
 	}
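
The two hunks above replace "err = ...; ASSERT0(err);" with VERIFY0(...):
ASSERT-family checks compile away in non-DEBUG kernels, so the old form
silently discarded the error in production builds, while VERIFY0 checks in
every build.  A minimal userland sketch of the distinction; the macro
bodies below are simplified stand-ins for the ZFS spellings, with abort()
standing in for panic:

	#include <stdio.h>
	#include <stdlib.h>

	#ifdef DEBUG
	#define ASSERT0(x)	do { if ((x) != 0) abort(); } while (0)
	#else
	#define ASSERT0(x)	((void)(x))	/* the check vanishes */
	#endif
	#define VERIFY0(x)	do { if ((x) != 0) abort(); } while (0)

	static int
	failing_op(void)
	{
		return (5);	/* pretend EIO */
	}

	int
	main(void)
	{
		int err;

		err = failing_op();	/* old pattern: call, then assert */
		ASSERT0(err);		/* non-DEBUG: nothing is checked  */
		printf("failure silently ignored (err=%d)\n", err);

		VERIFY0(failing_op());	/* new pattern: checked everywhere */
		printf("not reached\n");
		return (0);
	}
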
 
-	getnewvnode_reserve(1);
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
 	VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
 
@@ -1030,8 +998,7 @@
 
 	if (obj_type == DMU_OT_ZNODE ||
 	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
-		err = zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx);
-		ASSERT0(err);
+		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
 	}
 	if (!(flag & IS_ROOT_NODE)) {
 		vnode_t *vp;
@@ -1043,7 +1010,6 @@
 		KASSERT(err == 0, ("insmntque() failed: error %d", err));
 	}
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
-	getnewvnode_drop_reserve();
 }
 
 /*
@@ -1183,54 +1149,55 @@
 	if (hdl != NULL) {
 		zp  = sa_get_userdata(hdl);
 
-
 		/*
 		 * Since "SA" does immediate eviction we
 		 * should never find a sa handle that doesn't
 		 * know about the znode.
 		 */
-
 		ASSERT3P(zp, !=, NULL);
-
-		mutex_enter(&zp->z_lock);
 		ASSERT3U(zp->z_id, ==, obj_num);
-		if (zp->z_unlinked) {
-			err = SET_ERROR(ENOENT);
-		} else {
-			vp = ZTOV(zp);
-			*zpp = zp;
-			err = 0;
-		}
-		sa_buf_rele(db, NULL);
+		*zpp = zp;
+		vp = ZTOV(zp);
 
 		/* Don't let the vnode disappear after ZFS_OBJ_HOLD_EXIT. */
-		if (err == 0)
-			VN_HOLD(vp);
+		VN_HOLD(vp);
 
-		mutex_exit(&zp->z_lock);
+		sa_buf_rele(db, NULL);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 
-		if (err == 0) {
-			locked = VOP_ISLOCKED(vp);
-			VI_LOCK(vp);
-			if ((vp->v_iflag & VI_DOOMED) != 0 &&
-			    locked != LK_EXCLUSIVE) {
-				/*
-				 * The vnode is doomed and this thread doesn't
-				 * hold the exclusive lock on it, so the vnode
-				 * must be being reclaimed by another thread.
-				 * Otherwise the doomed vnode is being reclaimed
-				 * by this thread and zfs_zget is called from
-				 * ZIL internals.
-				 */
-				VI_UNLOCK(vp);
-				VN_RELE(vp);
-				goto again;
-			}
+		locked = VOP_ISLOCKED(vp);
+		VI_LOCK(vp);
+		if ((vp->v_iflag & VI_DOOMED) != 0 &&
+		    locked != LK_EXCLUSIVE) {
+			/*
+			 * The vnode is doomed and this thread doesn't
+			 * hold the exclusive lock on it, so the vnode
+			 * must be being reclaimed by another thread.
+			 * Otherwise the doomed vnode is being reclaimed
+			 * by this thread and zfs_zget is called from
+			 * ZIL internals.
+			 */
 			VI_UNLOCK(vp);
+
+			/*
+			 * XXX vrele() locks the vnode when the last reference
+			 * is dropped.  Although in this case the vnode is
+			 * doomed / dead and so no inactivation is required,
+			 * the vnode lock is still acquired.  That could result
+			 * in a LOR with z_teardown_lock if another thread holds
+			 * the vnode's lock and tries to take z_teardown_lock.
+			 * But that is only possible if the other thread performs
+			 * a ZFS vnode operation on the vnode.  That either
+			 * should not happen if the vnode is dead or the thread
+			 * should also have a reference to the vnode and thus
+			 * our reference is not last.
+			 */
+			VN_RELE(vp);
+			goto again;
 		}
+		VI_UNLOCK(vp);
 		getnewvnode_drop_reserve();
-		return (err);
+		return (0);
 	}
 
 	/*
@@ -1255,9 +1222,10 @@
 		vnode_t *vp = ZTOV(zp);
 
 		err = insmntque(vp, zfsvfs->z_vfs);
-		if (err == 0)
+		if (err == 0) {
+			vp->v_hash = obj_num;
 			VOP_UNLOCK(vp, 0);
-		else {
+		} else {
 			zp->z_vnode = NULL;
 			zfs_znode_dmu_fini(zp);
 			zfs_znode_free(zp);
@@ -1345,15 +1313,25 @@
 	}
 
 	/*
-	 * XXXPJD: Not sure how is that possible, but under heavy
-	 * zfs recv -F load it happens that z_gen is the same, but
-	 * vnode type is different than znode type. This would mean
-	 * that for example regular file was replaced with directory
-	 * which has the same object number.
+	 * It is highly improbable but still quite possible that two
+	 * objects in different datasets are created with the same
+	 * object numbers and in transaction groups with the same
+	 * numbers.  znodes corresponding to those objects would
+	 * have the same z_id and z_gen, but their other attributes
+	 * may be different.
+	 * zfs recv -F may replace one such object with the other.
+	 * As a result file properties recorded in the replaced
+	 * object's vnode may no longer match the received object's
+	 * properties.  At present the only cached property is the
+	 * file's type recorded in v_type.
+	 * So, handle this case by leaving the old vnode and znode
+	 * disassociated from the actual object.  A new vnode and a
+	 * znode will be created if the object is accessed
+	 * (e.g. via a look-up).  The old vnode and znode will be
+	 * recycled when the last vnode reference is dropped.
 	 */
 	vp = ZTOV(zp);
-	if (vp != NULL &&
-	    vp->v_type != IFTOVT((mode_t)zp->z_mode)) {
+	if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) {
 		zfs_znode_dmu_fini(zp);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (EIO);
@@ -1361,11 +1339,9 @@
 
 	zp->z_unlinked = (zp->z_links == 0);
 	zp->z_blksz = doi.doi_data_block_size;
-	if (vp != NULL) {
-		vn_pages_remove(vp, 0, 0);
-		if (zp->z_size != size)
-			vnode_pager_setsize(vp, zp->z_size);
-	}
+	vn_pages_remove(vp, 0, 0);
+	if (zp->z_size != size)
+		vnode_pager_setsize(vp, zp->z_size);
 
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 
@@ -1404,20 +1380,16 @@
 	 */
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
 
-	mutex_enter(&zp->z_lock);
-
 	/*
 	 * If this was the last reference to a file with no links,
 	 * remove the file from the file system.
 	 */
 	if (zp->z_unlinked) {
-		mutex_exit(&zp->z_lock);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 		zfs_rmnode(zp);
 		return;
 	}
 
-	mutex_exit(&zp->z_lock);
 	zfs_znode_dmu_fini(zp);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 	zfs_znode_free(zp);
@@ -1442,7 +1414,9 @@
 
 	kmem_cache_free(znode_cache, zp);
 
+#ifdef illumos
 	VFS_RELE(zfsvfs->z_vfs);
+#endif
 }
 
 void
@@ -1451,7 +1425,7 @@
 {
 	timestruc_t	now;
 
-	gethrestime(&now);
+	vfs_timestamp(&now);
 
 	if (have_tx) {	/* will sa_bulk_update happen really soon? */
 		zp->z_atime_dirty = 0;
@@ -1515,7 +1489,7 @@
 	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
 }
 
-#ifdef sun
+#ifdef illumos
 /*
  * This is a dummy interface used when pvn_vplist_dirty() should *not*
  * be calling back into the fs for a putpage().  E.g.: when truncating
@@ -1529,7 +1503,7 @@
 	ASSERT(0);
 	return (0);
 }
-#endif	/* sun */
+#endif
 
 /*
  * Increase the file length
@@ -1537,7 +1511,7 @@
  *	IN:	zp	- znode of file to free data in.
  *		end	- new end-of-file
  *
- * 	RETURN:	0 on success, error code on failure
+ *	RETURN:	0 on success, error code on failure
  */
 static int
 zfs_extend(znode_t *zp, uint64_t end)
@@ -1560,7 +1534,6 @@
 		zfs_range_unlock(rl);
 		return (0);
 	}
-top:
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
@@ -1570,8 +1543,13 @@
 		 * We are growing the file past the current block size.
 		 */
 		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+			/*
+			 * File's blocksize is already larger than the
+			 * "recordsize" property.  Only let it grow to
+			 * the next power of 2.
+			 */
 			ASSERT(!ISP2(zp->z_blksz));
-			newblksz = MIN(end, SPA_MAXBLOCKSIZE);
+			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
 		} else {
 			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
 		}
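
zp->z_blksz can exceed the "recordsize" property only when it is not a power
of 2, and the new bound lets such a block grow just to the next power of 2
instead of all the way to SPA_MAXBLOCKSIZE.  A small sketch of the rounding,
with a local highbit64() written to match the illumos semantics (1-based
index of the highest set bit; example size invented):

    #include <stdint.h>
    #include <stdio.h>

    /* Analogue of illumos highbit64(): 1-based index of the highest bit. */
    static int
    highbit64(uint64_t x)
    {
    	int h = 0;

    	while (x != 0) {
    		h++;
    		x >>= 1;
    	}
    	return (h);
    }

    int
    main(void)
    {
    	uint64_t blksz = 100 * 1024;	/* 100K, not a power of 2 */

    	/* Grows only to the next power of 2: prints 131072 (128K). */
    	printf("%ju\n", (uintmax_t)(1ULL << highbit64(blksz)));
    	return (0);
    }
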
@@ -1580,13 +1558,8 @@
 		newblksz = 0;
 	}
 
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
 		dmu_tx_abort(tx);
 		zfs_range_unlock(rl);
 		return (error);
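
The same simplification is applied in zfs_trunc() and zfs_freesp() below:
with TXG_NOWAIT, a full transaction group returned ERESTART and the caller
had to dmu_tx_wait(), abort, and loop back to top; TXG_WAIT moves that
waiting inside dmu_tx_assign(), so the retry scaffolding disappears.  A toy
model of the two calling conventions (stand-in function, not the DMU API):

    #include <stdio.h>

    enum { TOY_NOWAIT, TOY_WAIT };
    #define	TOY_ERESTART	85

    static int full_txgs = 1;	/* pretend one txg is full, then OK */

    static int
    toy_tx_assign(int how)
    {
    	while (full_txgs > 0) {
    		if (how == TOY_NOWAIT)
    			return (TOY_ERESTART);	/* caller must retry */
    		full_txgs--;			/* WAIT: block internally */
    	}
    	return (0);
    }

    int
    main(void)
    {
    	/* One call, no goto-based retry loop in the caller. */
    	printf("assign: %d\n", toy_tx_assign(TOY_WAIT));
    	return (0);
    }
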
@@ -1616,7 +1589,7 @@
  *		off	- start of section to free.
  *		len	- length of section to free.
  *
- * 	RETURN:	0 on success, error code on failure
+ *	RETURN:	0 on success, error code on failure
  */
 static int
 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
@@ -1663,7 +1636,7 @@
  *	IN:	zp	- znode of file to free data in.
  *		end	- new end-of-file.
  *
- * 	RETURN:	0 on success, error code on failure
+ *	RETURN:	0 on success, error code on failure
  */
 static int
 zfs_trunc(znode_t *zp, uint64_t end)
@@ -1694,17 +1667,12 @@
 		zfs_range_unlock(rl);
 		return (error);
 	}
-top:
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	dmu_tx_mark_netfree(tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
 		dmu_tx_abort(tx);
 		zfs_range_unlock(rl);
 		return (error);
@@ -1745,7 +1713,7 @@
  *		flag	- current file open mode flags.
  *		log	- TRUE if this action should be logged
  *
- * 	RETURN:	0 on success, error code on failure
+ *	RETURN:	0 on success, error code on failure
  */
 int
 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
@@ -1795,13 +1763,8 @@
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto log;
-		}
 		dmu_tx_abort(tx);
 		return (error);
 	}
@@ -1823,7 +1786,6 @@
 void
 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 {
-	zfsvfs_t	zfsvfs;
 	uint64_t	moid, obj, sa_obj, version;
 	uint64_t	sense = ZFS_CASE_SENSITIVE;
 	uint64_t	norm = 0;
@@ -1831,7 +1793,7 @@
 	int		error;
 	int		i;
 	znode_t		*rootzp = NULL;
-	vnode_t		vnode;
+	zfsvfs_t	*zfsvfs;
 	vattr_t		vattr;
 	znode_t		*zp;
 	zfs_acl_ids_t	acl_ids;
@@ -1907,10 +1869,9 @@
 	vattr.va_uid = crgetuid(cr);
 	vattr.va_gid = crgetgid(cr);
 
-	bzero(&zfsvfs, sizeof (zfsvfs_t));
+	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 
 	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
-	zfs_znode_cache_constructor(rootzp, NULL, 0);
 	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
 	rootzp->z_moved = 0;
 	rootzp->z_unlinked = 0;
@@ -1917,19 +1878,15 @@
 	rootzp->z_atime_dirty = 0;
 	rootzp->z_is_sa = USE_SA(version, os);
 
-	vnode.v_type = VDIR;
-	vnode.v_data = rootzp;
-	rootzp->z_vnode = &vnode;
+	zfsvfs->z_os = os;
+	zfsvfs->z_parent = zfsvfs;
+	zfsvfs->z_version = version;
+	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
+	zfsvfs->z_use_sa = USE_SA(version, os);
+	zfsvfs->z_norm = norm;
 
-	zfsvfs.z_os = os;
-	zfsvfs.z_parent = &zfsvfs;
-	zfsvfs.z_version = version;
-	zfsvfs.z_use_fuids = USE_FUIDS(version, os);
-	zfsvfs.z_use_sa = USE_SA(version, os);
-	zfsvfs.z_norm = norm;
-
 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
-	    &zfsvfs.z_attr_table);
+	    &zfsvfs->z_attr_table);
 
 	ASSERT(error == 0);
 
@@ -1938,16 +1895,16 @@
 	 * insensitive.
 	 */
 	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
-		zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;
+		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 
-	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
-	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
+	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
 
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
-		mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
-	rootzp->z_zfsvfs = &zfsvfs;
+	rootzp->z_zfsvfs = zfsvfs;
 	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
 	    cr, NULL, &acl_ids));
 	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
@@ -1958,7 +1915,6 @@
 	POINTER_INVALIDATE(&rootzp->z_zfsvfs);
 
 	sa_handle_destroy(rootzp->z_sa_hdl);
-	rootzp->z_vnode = NULL;
 	kmem_cache_free(znode_cache, rootzp);
 
 	/*
@@ -1965,14 +1921,14 @@
 	 * Create shares directory
 	 */
 
-	error = zfs_create_share_dir(&zfsvfs, tx);
+	error = zfs_create_share_dir(zfsvfs, tx);
 
 	ASSERT(error == 0);
 
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
-		mutex_destroy(&zfsvfs.z_hold_mtx[i]);
+		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 }
-
 #endif /* _KERNEL */
 
 static int
@@ -2228,3 +2184,35 @@
 	zfs_release_sa_handle(hdl, db, FTAG);
 	return (error);
 }
+
+#ifdef _KERNEL
+int
+zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint64_t parent;
+	int is_xattrdir;
+	int err;
+
+	/* Extended attributes should not be visible as regular files. */
+	if ((zp->z_pflags & ZFS_XATTR) != 0)
+		return (SET_ERROR(EINVAL));
+
+	err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table,
+	    &parent, &is_xattrdir);
+	if (err != 0)
+		return (err);
+	ASSERT0(is_xattrdir);
+
+	/* No name as this is a root object. */
+	if (parent == zp->z_id)
+		return (SET_ERROR(EINVAL));
+
+	err = zap_value_search(zfsvfs->z_os, parent, zp->z_id,
+	    ZFS_DIRENT_OBJ(-1ULL), buf);
+	if (err != 0)
+		return (err);
+	err = zfs_zget(zfsvfs, parent, dzpp);
+	return (err);
+}
+#endif /* _KERNEL */
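
The new zfs_znode_parent_and_name() resolves one step of the namespace: the
parent object and the entry name of a znode within it.  A hedged userland
sketch of how such a primitive composes into a full path, walking a toy
id -> (parent, name) table up to the root (the table and helper are invented
for illustration only):

    #include <stdio.h>
    #include <string.h>

    struct dirent_toy {
    	unsigned	id, parent;
    	const char	*name;
    };

    /* Toy namespace: object 1 is the root and is its own parent. */
    static const struct dirent_toy table[] = {
    	{ 1, 1, "" }, { 4, 1, "usr" }, { 7, 4, "src" }, { 9, 7, "README" },
    };

    static int
    parent_and_name(unsigned id, unsigned *parent, const char **name)
    {
    	for (size_t i = 0; i < sizeof (table) / sizeof (table[0]); i++) {
    		if (table[i].id == id) {
    			if (table[i].parent == id)
    				return (-1);	/* root: no name (EINVAL) */
    			*parent = table[i].parent;
    			*name = table[i].name;
    			return (0);
    		}
    	}
    	return (-1);
    }

    int
    main(void)
    {
    	char path[256] = "";
    	unsigned id = 9, parent;
    	const char *name;

    	while (parent_and_name(id, &parent, &name) == 0) {
    		char tmp[256];

    		snprintf(tmp, sizeof (tmp), "/%s%s", name, path);
    		strcpy(path, tmp);	/* prepend one component */
    		id = parent;
    	}
    	printf("%s\n", path);		/* /usr/src/README */
    	return (0);
    }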

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -90,10 +91,17 @@
 SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0,
     "Enable ZFS TRIM");
 
+/*
+ * Limit SLOG write size per commit executed with synchronous priority.
+ * Any writes above that are executed with lower (asynchronous) priority
+ * to limit potential SLOG device abuse by a single active ZIL writer.
+ */
+uint64_t zil_slog_limit = 768 * 1024;
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_limit, CTLFLAG_RWTUN,
+    &zil_slog_limit, 0, "Maximal SLOG commit size with sync priority");
+
 static kmem_cache_t *zil_lwb_cache;
 
-static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
-
 #define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
     sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
 
@@ -150,10 +158,15 @@
 zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
 {
 	avl_tree_t *t = &zilog->zl_bp_tree;
-	const dva_t *dva = BP_IDENTITY(bp);
+	const dva_t *dva;
 	zil_bp_node_t *zn;
 	avl_index_t where;
 
+	if (BP_IS_EMBEDDED(bp))
+		return (0);
+
+	dva = BP_IDENTITY(bp);
+
 	if (avl_find(t, dva, &where) != NULL)
 		return (SET_ERROR(EEXIST));
 
@@ -189,9 +202,9 @@
     char **end)
 {
 	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
-	uint32_t aflags = ARC_WAIT;
+	arc_flags_t aflags = ARC_FLAG_WAIT;
 	arc_buf_t *abuf = NULL;
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 	int error;
 
 	if (zilog->zl_header->zh_claim_txg == 0)
@@ -228,6 +241,7 @@
 			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
 				error = SET_ERROR(ECKSUM);
 			} else {
+				ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
 				bcopy(lr, dst, len);
 				*end = (char *)dst + len;
 				*nbp = zilc->zc_next_blk;
@@ -242,6 +256,8 @@
 			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
 				error = SET_ERROR(ECKSUM);
 			} else {
+				ASSERT3U(zilc->zc_nused, <=,
+				    SPA_OLD_MAXBLOCKSIZE);
 				bcopy(lr, dst, zilc->zc_nused);
 				*end = (char *)dst + zilc->zc_nused;
 				*nbp = zilc->zc_next_blk;
@@ -248,7 +264,7 @@
 			}
 		}
 
-		VERIFY(arc_buf_remove_ref(abuf, &abuf));
+		arc_buf_destroy(abuf, &abuf);
 	}
 
 	return (error);
@@ -262,9 +278,9 @@
 {
 	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
 	const blkptr_t *bp = &lr->lr_blkptr;
-	uint32_t aflags = ARC_WAIT;
+	arc_flags_t aflags = ARC_FLAG_WAIT;
 	arc_buf_t *abuf = NULL;
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 	int error;
 
 	if (BP_IS_HOLE(bp)) {
@@ -285,7 +301,7 @@
 	if (error == 0) {
 		if (wbuf != NULL)
 			bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
-		(void) arc_buf_remove_ref(abuf, &abuf);
+		arc_buf_destroy(abuf, &abuf);
 	}
 
 	return (error);
@@ -325,7 +341,7 @@
 	 * If the log has been claimed, stop if we encounter a sequence
 	 * number greater than the highest claimed sequence number.
 	 */
-	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
+	lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
 	zil_bp_tree_init(zilog);
 
 	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
@@ -372,7 +388,7 @@
 	    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
 
 	zil_bp_tree_fini(zilog);
-	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
+	zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
 
 	return (error);
 }
@@ -384,7 +400,8 @@
 	 * Claim log block if not already committed and not already claimed.
 	 * If tx == NULL, just verify that the block is claimable.
 	 */
-	if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0)
+	if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
+	    zil_bp_tree_add(zilog, bp) != 0)
 		return (0);
 
 	return (zio_wait(zio_claim(NULL, zilog->zl_spa,
@@ -434,7 +451,8 @@
 	 * If we previously claimed it, we need to free it.
 	 */
 	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
-	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0)
+	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
+	    !BP_IS_HOLE(bp))
 		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 
 	return (0);
@@ -441,7 +459,7 @@
 }
 
 static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
 {
 	lwb_t *lwb;
 
@@ -448,6 +466,7 @@
 	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 	lwb->lwb_zilog = zilog;
 	lwb->lwb_blk = *bp;
+	lwb->lwb_slog = slog;
 	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
 	lwb->lwb_max_txg = txg;
 	lwb->lwb_zio = NULL;
@@ -477,7 +496,7 @@
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
-	if (dsl_dataset_is_snapshot(ds))
+	if (ds->ds_is_snapshot)
 		panic("dirtying snapshot!");
 
 	if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
@@ -486,7 +505,28 @@
 	}
 }
 
+/*
+ * Determine if the zil is dirty in the specified txg. Callers wanting to
+ * ensure that the dirty state does not change must hold the itxg_lock for
+ * the specified txg. Holding the lock will ensure that the zil cannot be
+ * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
+ * state.
+ */
 boolean_t
+zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
+{
+	dsl_pool_t *dp = zilog->zl_dmu_pool;
+
+	if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
+		return (B_TRUE);
+	return (B_FALSE);
+}
+
+/*
+ * Determine if the zil is dirty. The zil is considered dirty if it has
+ * any pending itx records that have not been cleaned by zil_clean().
+ */
+boolean_t
 zilog_is_dirty(zilog_t *zilog)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
@@ -510,6 +550,7 @@
 	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
 	int error = 0;
+	boolean_t slog = FALSE;
 
 	/*
 	 * Wait for any previous destroy to complete.
@@ -538,7 +579,7 @@
 		}
 
 		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
-		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+		    ZIL_MIN_BLKSZ, &slog);
 
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
@@ -548,7 +589,7 @@
 	 * Allocate a log write buffer (lwb) for the first log block.
 	 */
 	if (error == 0)
-		lwb = zil_alloc_lwb(zilog, &blk, txg);
+		lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
@@ -630,7 +671,7 @@
 }
 
 int
-zil_claim(const char *osname, void *txarg)
+zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
 {
 	dmu_tx_t *tx = txarg;
 	uint64_t first_txg = dmu_tx_get_txg(tx);
@@ -639,9 +680,17 @@
 	objset_t *os;
 	int error;
 
-	error = dmu_objset_own(osname, DMU_OST_ANY, B_FALSE, FTAG, &os);
+	error = dmu_objset_own_obj(dp, ds->ds_object,
+	    DMU_OST_ANY, B_FALSE, FTAG, &os);
 	if (error != 0) {
-		cmn_err(CE_WARN, "can't open objset for %s", osname);
+		/*
+		 * EBUSY indicates that the objset is inconsistent, in which
+		 * case it cannot have a ZIL.
+		 */
+		if (error != EBUSY) {
+			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
+			    (unsigned long long)ds->ds_object, error);
+		}
 		return (0);
 	}
 
@@ -687,8 +736,9 @@
  * Checksum errors are ok as they indicate the end of the chain.
  * Any other error (no device or read failure) returns an error.
  */
+/* ARGSUSED */
 int
-zil_check_log_chain(const char *osname, void *tx)
+zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
 {
 	zilog_t *zilog;
 	objset_t *os;
@@ -697,9 +747,10 @@
 
 	ASSERT(tx == NULL);
 
-	error = dmu_objset_hold(osname, FTAG, &os);
+	error = dmu_objset_from_ds(ds, &os);
 	if (error != 0) {
-		cmn_err(CE_WARN, "can't open objset for %s", osname);
+		cmn_err(CE_WARN, "can't open objset %llu, error %d",
+		    (unsigned long long)ds->ds_object, error);
 		return (0);
 	}
 
@@ -722,10 +773,8 @@
 			valid = vdev_log_state_valid(vd);
 		spa_config_exit(os->os_spa, SCL_STATE, FTAG);
 
-		if (!valid) {
-			dmu_objset_rele(os, FTAG);
+		if (!valid)
 			return (0);
-		}
 	}
 
 	/*
@@ -738,8 +787,6 @@
 	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
 	    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
 
-	dmu_objset_rele(os, FTAG);
-
 	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
 }
 
@@ -795,7 +842,7 @@
 	avl_tree_t *t = &zilog->zl_vdev_tree;
 	void *cookie = NULL;
 	zil_vdev_node_t *zv;
-	zio_t *zio;
+	zio_t *zio = NULL;
 
 	ASSERT(zilog->zl_writer);
 
@@ -808,12 +855,13 @@
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
-	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-
 	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
 		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
-		if (vd != NULL)
+		if (vd != NULL && !vd->vdev_nowritecache) {
+			if (zio == NULL)
+				zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 			zio_flush(zio, vd);
+		}
 		kmem_free(zv, sizeof (*zv));
 	}
 
@@ -821,7 +869,8 @@
 	 * Wait for all the flushes to complete.  Not all devices actually
 	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
 	 */
-	(void) zio_wait(zio);
+	if (zio)
+		(void) zio_wait(zio);
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 }
@@ -842,7 +891,7 @@
 	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
 	ASSERT(!BP_IS_GANG(zio->io_bp));
 	ASSERT(!BP_IS_HOLE(zio->io_bp));
-	ASSERT(zio->io_bp->blk_fill == 0);
+	ASSERT(BP_GET_FILL(zio->io_bp) == 0);
 
 	/*
 	 * Ensure the lwb buffer pointer is cleared before releasing
@@ -872,7 +921,8 @@
 static void
 zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
 {
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
+	zio_priority_t prio;
 
 	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
@@ -883,9 +933,13 @@
 		    ZIO_FLAG_CANFAIL);
 	}
 	if (lwb->lwb_zio == NULL) {
+		if (zilog->zl_cur_used <= zil_slog_limit || !lwb->lwb_slog)
+			prio = ZIO_PRIORITY_SYNC_WRITE;
+		else
+			prio = ZIO_PRIORITY_ASYNC_WRITE;
 		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
 		    0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
-		    zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
+		    zil_lwb_write_done, lwb, prio,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
 	}
 }
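
The priority choice above implements the zil_slog_limit policy declared near
the top of this file: small commits to a separate log device stay at
synchronous priority, while a single committer that has already pushed more
than the limit drops to asynchronous priority.  A stand-alone restatement of
the decision (toy enum; the constant mirrors the tunable's default):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef enum { PRIO_SYNC_WRITE, PRIO_ASYNC_WRITE } prio_t;

    static const uint64_t slog_limit = 768 * 1024;	/* zil_slog_limit */

    static prio_t
    pick_priority(uint64_t cur_used, bool on_slog)
    {
    	/*
    	 * Writes to the main pool are always synchronous; on a SLOG,
    	 * only the first slog_limit bytes of a commit keep sync
    	 * priority, so one writer cannot monopolize the device.
    	 */
    	if (cur_used <= slog_limit || !on_slog)
    		return (PRIO_SYNC_WRITE);
    	return (PRIO_ASYNC_WRITE);
    }

    int
    main(void)
    {
    	printf("%d\n", pick_priority(1024 * 1024, true));	/* async */
    	return (0);
    }
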
@@ -895,7 +949,7 @@
  *
  * These must be a multiple of 4KB. Note only the amount used (again
  * aligned to 4KB) actually gets written. However, we can't always just
- * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
+ * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
  */
 uint64_t zil_block_buckets[] = {
     4096,		/* non TX_WRITE */
@@ -905,21 +959,11 @@
 };
 
 /*
- * Use the slog as long as the logbias is 'latency' and the current commit size
- * is less than the limit or the total list size is less than 2X the limit.
- * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
- */
-uint64_t zil_slog_limit = 1024 * 1024;
-#define	USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
-	(((zilog)->zl_cur_used < zil_slog_limit) || \
-	((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
-
-/*
  * Start a log block write and advance to the next log block.
  * Calls are serialized.
  */
 static lwb_t *
-zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
+zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb, boolean_t last)
 {
 	lwb_t *nlwb = NULL;
 	zil_chain_t *zilc;
@@ -929,6 +973,7 @@
 	uint64_t txg;
 	uint64_t zil_blksz, wsz;
 	int i, error;
+	boolean_t slog;
 
 	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
 		zilc = (zil_chain_t *)lwb->lwb_buf;
@@ -950,7 +995,15 @@
 	 * to clean up in the event of allocation failure or I/O failure.
 	 */
 	tx = dmu_tx_create(zilog->zl_os);
-	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
+
+	/*
+	 * Since we are not going to create any new dirty data, and we
+	 * can even help with clearing the existing dirty data, we
+	 * should not be subject to the dirty-data-based delays. We
+	 * use TXG_NOTHROTTLE to bypass the delay mechanism.
+	 */
+	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
+
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 
@@ -977,7 +1030,7 @@
 		continue;
 	zil_blksz = zil_block_buckets[i];
 	if (zil_blksz == UINT64_MAX)
-		zil_blksz = SPA_MAXBLOCKSIZE;
+		zil_blksz = SPA_OLD_MAXBLOCKSIZE;
 	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
 	for (i = 0; i < ZIL_PREV_BLKS; i++)
 		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
@@ -985,8 +1038,7 @@
 
 	BP_ZERO(bp);
 	/* pass the old blkptr in order to spread log blocks across devs */
-	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
-	    USE_SLOG(zilog));
+	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
 	if (error == 0) {
 		ASSERT3U(bp->blk_birth, ==, txg);
 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
@@ -995,7 +1047,7 @@
 		/*
 		 * Allocate a new log write buffer (lwb).
 		 */
-		nlwb = zil_alloc_lwb(zilog, bp, txg);
+		nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
 
 		/* Record the block for later vdev flushing */
 		zil_add_block(zilog, &lwb->lwb_blk);
@@ -1020,6 +1072,8 @@
 	 */
 	bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
 
+	if (last)
+		lwb->lwb_zio->io_pipeline &= ~ZIO_STAGE_ISSUE_ASYNC;
 	zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */
 
 	/*
@@ -1032,19 +1086,18 @@
 static lwb_t *
 zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 {
-	lr_t *lrc = &itx->itx_lr; /* common log record */
-	lr_write_t *lrw = (lr_write_t *)lrc;
+	lr_t *lrcb, *lrc = &itx->itx_lr; /* common log record */
+	lr_write_t *lrwb, *lrw = (lr_write_t *)lrc;
 	char *lr_buf;
 	uint64_t txg = lrc->lrc_txg;
 	uint64_t reclen = lrc->lrc_reclen;
 	uint64_t dlen = 0;
+	uint64_t dnow, lwb_sp;
 
 	if (lwb == NULL)
 		return (NULL);
 
 	ASSERT(lwb->lwb_buf != NULL);
-	ASSERT(zilog_is_dirty(zilog) ||
-	    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
 
 	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
 		dlen = P2ROUNDUP_TYPED(
@@ -1054,25 +1107,30 @@
 
 	zil_lwb_write_init(zilog, lwb);
 
+cont:
 	/*
 	 * If this record won't fit in the current log block, start a new one.
+	 * For WR_NEED_COPY optimize layout for minimal number of chunks, but
+	 * try to keep wasted space within a reasonable range (12%).
 	 */
-	if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
-		lwb = zil_lwb_write_start(zilog, lwb);
+	lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+	if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
+	    lwb_sp < ZIL_MAX_LOG_DATA / 8 && (dlen % ZIL_MAX_LOG_DATA == 0 ||
+	    lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
+		lwb = zil_lwb_write_start(zilog, lwb, B_FALSE);
 		if (lwb == NULL)
 			return (NULL);
 		zil_lwb_write_init(zilog, lwb);
 		ASSERT(LWB_EMPTY(lwb));
-		if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
-			txg_wait_synced(zilog->zl_dmu_pool, txg);
-			return (lwb);
-		}
+		lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+		ASSERT3U(reclen + MIN(dlen, sizeof(uint64_t)), <=, lwb_sp);
 	}
 
+	dnow = MIN(dlen, lwb_sp - reclen);
 	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
 	bcopy(lrc, lr_buf, reclen);
-	lrc = (lr_t *)lr_buf;
-	lrw = (lr_write_t *)lrc;
+	lrcb = (lr_t *)lr_buf;
+	lrwb = (lr_write_t *)lrcb;
 
 	/*
 	 * If it's a write, fetch the data or get its blkptr as appropriate.
@@ -1084,16 +1142,19 @@
 			char *dbuf;
 			int error;
 
-			if (dlen) {
-				ASSERT(itx->itx_wr_state == WR_NEED_COPY);
+			if (itx->itx_wr_state == WR_NEED_COPY) {
 				dbuf = lr_buf + reclen;
-				lrw->lr_common.lrc_reclen += dlen;
+				lrcb->lrc_reclen += dnow;
+				if (lrwb->lr_length > dnow)
+					lrwb->lr_length = dnow;
+				lrw->lr_offset += dnow;
+				lrw->lr_length -= dnow;
 			} else {
 				ASSERT(itx->itx_wr_state == WR_INDIRECT);
 				dbuf = NULL;
 			}
 			error = zilog->zl_get_data(
-			    itx->itx_private, lrw, dbuf, lwb->lwb_zio);
+			    itx->itx_private, lrwb, dbuf, lwb->lwb_zio);
 			if (error == EIO) {
 				txg_wait_synced(zilog->zl_dmu_pool, txg);
 				return (lwb);
@@ -1112,12 +1173,18 @@
 	 * equal to the itx sequence number because not all transactions
 	 * are synchronous, and sometimes spa_sync() gets there first.
 	 */
-	lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
-	lwb->lwb_nused += reclen + dlen;
+	lrcb->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
+	lwb->lwb_nused += reclen + dnow;
 	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
 	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
 	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
 
+	dlen -= dnow;
+	if (dlen > 0) {
+		zilog->zl_cur_used += reclen;
+		goto cont;
+	}
+
 	return (lwb);
 }
 
@@ -1131,7 +1198,6 @@
 	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
 	itx->itx_lr.lrc_txtype = txtype;
 	itx->itx_lr.lrc_reclen = lrsize;
-	itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
 	itx->itx_lr.lrc_seq = 0;	/* defensive */
 	itx->itx_sync = B_TRUE;		/* default is synchronous */
 
@@ -1280,11 +1346,8 @@
 			 * this itxg. Save the itxs for release below.
 			 * This should be rare.
 			 */
-			atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
-			itxg->itxg_sod = 0;
 			clean = itxg->itxg_itxs;
 		}
-		ASSERT(itxg->itxg_sod == 0);
 		itxg->itxg_txg = txg;
 		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
 
@@ -1296,8 +1359,6 @@
 	}
 	if (itx->itx_sync) {
 		list_insert_tail(&itxs->i_sync_list, itx);
-		atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
-		itxg->itxg_sod += itx->itx_sod;
 	} else {
 		avl_tree_t *t = &itxs->i_async_tree;
 		uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
@@ -1345,8 +1406,6 @@
 	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
 	ASSERT(itxg->itxg_txg != 0);
 	ASSERT(zilog->zl_clean_taskq != NULL);
-	atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
-	itxg->itxg_sod = 0;
 	clean_me = itxg->itxg_itxs;
 	itxg->itxg_itxs = NULL;
 	itxg->itxg_txg = 0;
@@ -1370,7 +1429,6 @@
 {
 	uint64_t otxg, txg;
 	list_t *commit_list = &zilog->zl_itx_commit_list;
-	uint64_t push_sod = 0;
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
 		otxg = ZILTEST_TXG;
@@ -1377,6 +1435,11 @@
 	else
 		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
 
+	/*
+	 * This is inherently racy, since there is nothing to prevent
+	 * the last synced txg from changing. That's okay since we'll
+	 * only commit things in the future.
+	 */
 	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
@@ -1386,19 +1449,26 @@
 			continue;
 		}
 
+		/*
+		 * If we're adding itx records to the zl_itx_commit_list,
+		 * then the zil better be dirty in this "txg". We can assert
+		 * that here since we're holding the itxg_lock which will
+		 * prevent spa_sync from cleaning it. Once we add the itxs
+		 * to the zl_itx_commit_list we must commit it to disk even
+		 * if it's unnecessary (i.e. the txg was synced).
+		 */
+		ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
+		    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
 		list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
-		push_sod += itxg->itxg_sod;
-		itxg->itxg_sod = 0;
 
 		mutex_exit(&itxg->itxg_lock);
 	}
-	atomic_add_64(&zilog->zl_itx_list_sz, -push_sod);
 }
 
 /*
  * Move the async itxs for a specified object to commit into sync lists.
  */
-static void
+void
 zil_async_to_sync(zilog_t *zilog, uint64_t foid)
 {
 	uint64_t otxg, txg;
@@ -1411,6 +1481,10 @@
 	else
 		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
 
+	/*
+	 * This is inherently racy, since there is nothing to prevent
+	 * the last synced txg from changing.
+	 */
 	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
@@ -1482,8 +1556,14 @@
 	DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
 	while (itx = list_head(&zilog->zl_itx_commit_list)) {
 		txg = itx->itx_lr.lrc_txg;
-		ASSERT(txg);
+		ASSERT3U(txg, !=, 0);
 
+		/*
+		 * This is inherently racy and may result in us writing
+		 * out a log block for a txg that was just synced. This is
+		 * ok since we'll end up cleaning up that log block the next
+		 * time we call zil_sync().
+		 */
 		if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa))
 			lwb = zil_lwb_commit(zilog, itx, lwb);
 		list_remove(&zilog->zl_itx_commit_list, itx);
@@ -1494,7 +1574,7 @@
 
 	/* write the last block out */
 	if (lwb != NULL && lwb->lwb_zio != NULL)
-		lwb = zil_lwb_write_start(zilog, lwb);
+		lwb = zil_lwb_write_start(zilog, lwb, B_TRUE);
 
 	zilog->zl_cur_used = 0;
 
@@ -1800,8 +1880,11 @@
 	mutex_exit(&zilog->zl_lock);
 	if (txg)
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
-	ASSERT(!zilog_is_dirty(zilog));
 
+	if (zilog_is_dirty(zilog))
+		zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
+	VERIFY(!zilog_is_dirty(zilog));
+
 	taskq_destroy(zilog->zl_clean_taskq);
 	zilog->zl_clean_taskq = NULL;
 	zilog->zl_get_data = NULL;
@@ -1957,7 +2040,7 @@
 static int
 zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
 {
-	char name[MAXNAMELEN];
+	char name[ZFS_MAX_DATASET_NAME_LEN];
 
 	zilog->zl_replaying_seq--;	/* didn't actually replay this one */
 
@@ -2077,7 +2160,6 @@
 		zil_destroy(zilog, B_TRUE);
 		return;
 	}
-	//printf("ZFS: Replaying ZIL on %s...\n", os->os->os_spa->spa_name);
 
 	zr.zr_replay = replay_func;
 	zr.zr_arg = arg;
@@ -2099,7 +2181,6 @@
 	zil_destroy(zilog, B_FALSE);
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 	zilog->zl_replay = B_FALSE;
-	//printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name);
 }
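
The new WR_NEED_COPY path in zil_lwb_commit() above splits one large write
record across several log blocks: each pass copies dnow = MIN(dlen,
lwb_sp - reclen) bytes and adjusts the per-chunk header fields (lrc_reclen,
lr_offset, lr_length) before looping.  A toy model of the chunking
arithmetic only (sizes invented; the real code also weighs the 12% waste
heuristic when deciding whether to open a new block):

    #include <stdint.h>
    #include <stdio.h>

    #define	MIN(a, b)	((a) < (b) ? (a) : (b))

    int
    main(void)
    {
    	uint64_t reclen = 192;		/* record header size (example) */
    	uint64_t dlen = 300 * 1024;	/* payload still to copy */
    	uint64_t lwb_sz = 128 * 1024;	/* size of each fresh lwb */
    	uint64_t lwb_sp = 8 * 1024;	/* space left in the current lwb */

    	while (dlen > 0) {
    		if (reclen > lwb_sp)	/* header must fit in one piece */
    			lwb_sp = lwb_sz;
    		uint64_t dnow = MIN(dlen, lwb_sp - reclen);

    		printf("chunk: %ju bytes\n", (uintmax_t)dnow);
    		dlen -= dnow;
    		lwb_sp = lwb_sz;	/* later chunks get a new lwb */
    	}
    	return (0);
    }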
 
 boolean_t

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,9 +21,12 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
+#include <sys/sysmacros.h>
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
@@ -37,13 +40,24 @@
 #include <sys/arc.h>
 #include <sys/ddt.h>
 #include <sys/trim_map.h>
+#include <sys/blkptr.h>
+#include <sys/zfeature.h>
+#include <sys/metaslab_impl.h>
 
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
+#if defined(__amd64__)
+static int zio_use_uma = 1;
+#else
 static int zio_use_uma = 0;
+#endif
 TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
 SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
     "Use uma(9) for ZIO allocations");
+static int zio_exclude_metadata = 0;
+TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
+    "Exclude metadata buffers from dumps as well");
 
 zio_trim_stats_t zio_trim_stats = {
 	{ "bytes",		KSTAT_DATA_UINT64,
@@ -60,35 +74,18 @@
 
 /*
  * ==========================================================================
- * I/O priority table
- * ==========================================================================
- */
-uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
-	0,	/* ZIO_PRIORITY_NOW		*/
-	0,	/* ZIO_PRIORITY_SYNC_READ	*/
-	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
-	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
-	1,	/* ZIO_PRIORITY_CACHE_FILL	*/
-	1,	/* ZIO_PRIORITY_AGG		*/
-	4,	/* ZIO_PRIORITY_FREE		*/
-	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
-	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
-	10,	/* ZIO_PRIORITY_RESILVER	*/
-	20,	/* ZIO_PRIORITY_SCRUB		*/
-	2,	/* ZIO_PRIORITY_DDT_PREFETCH	*/
-	30,	/* ZIO_PRIORITY_TRIM		*/
-};
-
-/*
- * ==========================================================================
  * I/O type descriptions
  * ==========================================================================
  */
-char *zio_type_name[ZIO_TYPES] = {
+const char *zio_type_name[ZIO_TYPES] = {
 	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
 	"zio_ioctl"
 };
 
+boolean_t zio_dva_throttle_enabled = B_TRUE;
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RDTUN,
+    &zio_dva_throttle_enabled, 0, "");
+
 /*
  * ==========================================================================
  * I/O kmem caches
@@ -102,8 +99,13 @@
 #ifdef _KERNEL
 extern vmem_t *zio_alloc_arena;
 #endif
-extern int zfs_mg_alloc_failures;
 
+#define	ZIO_PIPELINE_CONTINUE		0x100
+#define	ZIO_PIPELINE_STOP		0x101
+
+#define	BP_SPANB(indblkshift, level) \
+	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
+#define	COMPARE_META_LEVEL	0x80000000ul
 /*
 * The following actions directly affect the spa's sync-to-convergence logic.
  * The values below define the sync pass when we start performing the action.
@@ -137,12 +139,16 @@
 
 boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;
 
+#ifdef illumos
 #ifdef ZFS_DEBUG
 int zio_buf_debug_limit = 16384;
 #else
 int zio_buf_debug_limit = 0;
 #endif
+#endif
 
+static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
+
 void
 zio_init(void)
 {
@@ -151,20 +157,21 @@
 	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 	zio_link_cache = kmem_cache_create("zio_link_cache",
 	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	if (!zio_use_uma)
+		goto out;
 
 	/*
 	 * For small buffers, we want a cache for each multiple of
-	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
-	 * for each quarter-power of 2.  For large buffers, we want
-	 * a cache for each multiple of PAGESIZE.
+	 * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
+	 * for each quarter-power of 2.
 	 */
 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 		size_t p2 = size;
 		size_t align = 0;
-		size_t cflags = (size > zio_buf_debug_limit) ? (KMC_NODEBUG|KMC_NOTOUCH) : 0;
+		int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0;
 
-		while (p2 & (p2 - 1))
+		while (!ISP2(p2))
 			p2 &= p2 - 1;
 
 #ifdef illumos
@@ -181,10 +188,8 @@
 #endif /* illumos */
 		if (size <= 4 * SPA_MINBLOCKSIZE) {
 			align = SPA_MINBLOCKSIZE;
-		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
-			align = PAGESIZE;
 		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
-			align = p2 >> 2;
+			align = MIN(p2 >> 2, PAGESIZE);
 		}
 
 		if (align != 0) {
@@ -201,7 +206,7 @@
 			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
 			    align, NULL, NULL, NULL, NULL, NULL,
-			    cflags | KMC_NOTOUCH);
+			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
 		}
 	}
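
The alignment chosen in the loop above is derived from the largest power of
2 not exceeding the buffer size: repeatedly clearing the lowest set bit
leaves only the highest, which is exactly what the ISP2 loop computes.  A
stand-alone illustration (example size invented):

    #include <stddef.h>
    #include <stdio.h>

    #define	ISP2(x)	(((x) & ((x) - 1)) == 0)

    int
    main(void)
    {
    	size_t size = 12 * 1024;	/* example buffer size: 12K */
    	size_t p2 = size;

    	/* Clear low bits until only the highest remains: p2 = 8K. */
    	while (!ISP2(p2))
    		p2 &= p2 - 1;

    	/* Quarter-power-of-2 alignment candidate, as in the loop. */
    	printf("p2=%zu align=%zu\n", p2, p2 >> 2);
    	return (0);
    }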
 
@@ -214,16 +219,8 @@
 		if (zio_data_buf_cache[c - 1] == NULL)
 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 	}
+out:
 
-	/*
-	 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
-	 * to fail 3 times per txg or 8 failures, whichever is greater.
-	 */
-	if (zfs_mg_alloc_failures == 0)
-		zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
-	else if (zfs_mg_alloc_failures < 8)
-		zfs_mg_alloc_failures = 8;
-
 	zio_inject_init();
 
 	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
@@ -281,19 +278,35 @@
  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
  * excess / transient data in-core during a crashdump.
  */
-void *
-zio_buf_alloc(size_t size)
+static void *
+zio_buf_alloc_impl(size_t size, boolean_t canwait)
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;
 
-	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
-	if (zio_use_uma)
-		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
-	else
-		return (kmem_alloc(size, KM_SLEEP));
+	if (zio_use_uma) {
+		return (kmem_cache_alloc(zio_buf_cache[c],
+		    canwait ? KM_PUSHPAGE : KM_NOSLEEP));
+	} else {
+		return (kmem_alloc(size,
+		    (canwait ? KM_SLEEP : KM_NOSLEEP) | flags));
+	}
 }
 
+void *
+zio_buf_alloc(size_t size)
+{
+	return (zio_buf_alloc_impl(size, B_TRUE));
+}
+
+void *
+zio_buf_alloc_nowait(size_t size)
+{
+	return (zio_buf_alloc_impl(size, B_FALSE));
+}
+
 /*
  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
  * crashdump if the kernel panics.  This exists so that we will limit the amount
@@ -305,7 +318,7 @@
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
-	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	if (zio_use_uma)
 		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
@@ -318,7 +331,7 @@
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
-	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	if (zio_use_uma)
 		kmem_cache_free(zio_buf_cache[c], buf);
@@ -331,7 +344,7 @@
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
-	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	if (zio_use_uma)
 		kmem_cache_free(zio_data_buf_cache[c], buf);
@@ -344,9 +357,9 @@
  * Push and pop I/O transform buffers
  * ==========================================================================
  */
-static void
+void
 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
-	zio_transform_func_t *transform)
+    zio_transform_func_t *transform)
 {
 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 
@@ -362,7 +375,7 @@
 	zio->io_size = size;
 }
 
-static void
+void
 zio_pop_transforms(zio_t *zio)
 {
 	zio_transform_t *zt;
@@ -411,52 +424,39 @@
  * I/O parent/child relationships and pipeline interlocks
  * ==========================================================================
  */
-/*
- * NOTE - Callers to zio_walk_parents() and zio_walk_children must
- *        continue calling these functions until they return NULL.
- *        Otherwise, the next caller will pick up the list walk in
- *        some indeterminate state.  (Otherwise every caller would
- *        have to pass in a cookie to keep the state represented by
- *        io_walk_link, which gets annoying.)
- */
 zio_t *
-zio_walk_parents(zio_t *cio)
+zio_walk_parents(zio_t *cio, zio_link_t **zl)
 {
-	zio_link_t *zl = cio->io_walk_link;
 	list_t *pl = &cio->io_parent_list;
 
-	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
-	cio->io_walk_link = zl;
-
-	if (zl == NULL)
+	*zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
+	if (*zl == NULL)
 		return (NULL);
 
-	ASSERT(zl->zl_child == cio);
-	return (zl->zl_parent);
+	ASSERT((*zl)->zl_child == cio);
+	return ((*zl)->zl_parent);
 }
 
 zio_t *
-zio_walk_children(zio_t *pio)
+zio_walk_children(zio_t *pio, zio_link_t **zl)
 {
-	zio_link_t *zl = pio->io_walk_link;
 	list_t *cl = &pio->io_child_list;
 
-	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
-	pio->io_walk_link = zl;
-
-	if (zl == NULL)
+	*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
+	if (*zl == NULL)
 		return (NULL);
 
-	ASSERT(zl->zl_parent == pio);
-	return (zl->zl_child);
+	ASSERT((*zl)->zl_parent == pio);
+	return ((*zl)->zl_child);
 }
 
 zio_t *
 zio_unique_parent(zio_t *cio)
 {
-	zio_t *pio = zio_walk_parents(cio);
+	zio_link_t *zl = NULL;
+	zio_t *pio = zio_walk_parents(cio, &zl);
 
-	VERIFY(zio_walk_parents(cio) == NULL);
+	VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
 	return (pio);
 }
 
@@ -516,20 +516,26 @@
 }
 
 static boolean_t
-zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
+zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
 {
-	uint64_t *countp = &zio->io_children[child][wait];
 	boolean_t waiting = B_FALSE;
 
 	mutex_enter(&zio->io_lock);
 	ASSERT(zio->io_stall == NULL);
-	if (*countp != 0) {
-		zio->io_stage >>= 1;
-		zio->io_stall = countp;
-		waiting = B_TRUE;
+	for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
+		if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
+			continue;
+
+		uint64_t *countp = &zio->io_children[c][wait];
+		if (*countp != 0) {
+			zio->io_stage >>= 1;
+			ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
+			zio->io_stall = countp;
+			waiting = B_TRUE;
+			break;
+		}
 	}
 	mutex_exit(&zio->io_lock);
-
 	return (waiting);
 }
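
zio_wait_for_children() now takes a bitmask, so one call can wait on several
child types at once (e.g. ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT later
in this diff) and stalls on the first type with outstanding children.  A toy
sketch of the mask iteration (names and values invented so as not to suggest
the real flag encoding):

    #include <stdint.h>
    #include <stdio.h>

    enum { CHILD_VDEV, CHILD_GANG, CHILD_DDT, CHILD_LOGICAL, CHILD_TYPES };
    #define	CHILD_BIT(x)			(1U << (x))
    #define	CHILD_BIT_IS_SET(val, x)	((val) & CHILD_BIT(x))

    int
    main(void)
    {
    	/* Wait on gang and logical children in a single call. */
    	uint8_t childbits = CHILD_BIT(CHILD_GANG) | CHILD_BIT(CHILD_LOGICAL);

    	for (int c = 0; c < CHILD_TYPES; c++) {
    		if (!CHILD_BIT_IS_SET(childbits, c))
    			continue;
    		printf("would check child type %d\n", c);
    	}
    	return (0);
    }
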
 
@@ -544,10 +550,22 @@
 		*errorp = zio_worst_error(*errorp, zio->io_error);
 	pio->io_reexecute |= zio->io_reexecute;
 	ASSERT3U(*countp, >, 0);
-	if (--*countp == 0 && pio->io_stall == countp) {
+
+	(*countp)--;
+
+	if (*countp == 0 && pio->io_stall == countp) {
+		zio_taskq_type_t type =
+		    pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
+		    ZIO_TASKQ_INTERRUPT;
 		pio->io_stall = NULL;
 		mutex_exit(&pio->io_lock);
-		zio_execute(pio);
+		/*
+		 * Dispatch the parent zio in its own taskq so that
+		 * the child can continue to make progress. This also
+		 * prevents overflowing the stack when we have deeply nested
+		 * parent-child relationships.
+		 */
+		zio_taskq_dispatch(pio, type, B_FALSE);
 	} else {
 		mutex_exit(&pio->io_lock);
 	}
@@ -560,6 +578,45 @@
 		zio->io_error = zio->io_child_error[c];
 }
 
+int
+zio_timestamp_compare(const void *x1, const void *x2)
+{
+	const zio_t *z1 = x1;
+	const zio_t *z2 = x2;
+
+	if (z1->io_queued_timestamp < z2->io_queued_timestamp)
+		return (-1);
+	if (z1->io_queued_timestamp > z2->io_queued_timestamp)
+		return (1);
+
+	if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
+		return (-1);
+	if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
+		return (1);
+
+	if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
+		return (-1);
+	if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
+		return (1);
+
+	if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
+		return (-1);
+	if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
+		return (1);
+
+	if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
+		return (-1);
+	if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
+		return (1);
+
+	if (z1 < z2)
+		return (-1);
+	if (z1 > z2)
+		return (1);
+
+	return (0);
+}
+
 /*
  * ==========================================================================
  * Create the various types of I/O (read, write, free, etc)
@@ -568,8 +625,8 @@
 static zio_t *
 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
-    zio_type_t type, int priority, enum zio_flag flags,
-    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
+    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
+    vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
     enum zio_stage stage, enum zio_stage pipeline)
 {
 	zio_t *zio;
@@ -628,6 +685,7 @@
 	zio->io_orig_flags = zio->io_flags = flags;
 	zio->io_orig_stage = zio->io_stage = stage;
 	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
+	zio->io_pipeline_trace = ZIO_STAGE_OPEN;
 
 	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
 	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
@@ -675,13 +733,97 @@
 	return (zio_null(NULL, spa, NULL, done, private, flags));
 }
 
+void
+zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
+{
+	if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
+		zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
+		    bp, (longlong_t)BP_GET_TYPE(bp));
+	}
+	if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
+	    BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
+		zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
+		    bp, (longlong_t)BP_GET_CHECKSUM(bp));
+	}
+	if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
+	    BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
+		zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
+		    bp, (longlong_t)BP_GET_COMPRESS(bp));
+	}
+	if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
+		zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
+		    bp, (longlong_t)BP_GET_LSIZE(bp));
+	}
+	if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
+		zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
+		    bp, (longlong_t)BP_GET_PSIZE(bp));
+	}
+
+	if (BP_IS_EMBEDDED(bp)) {
+		if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
+			zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
+			    bp, (longlong_t)BPE_GET_ETYPE(bp));
+		}
+	}
+
+	/*
+	 * Pool-specific checks.
+	 *
+	 * Note: it would be nice to verify that the blk_birth and
+	 * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
+	 * allows the birth time of log blocks (and dmu_sync()-ed blocks
+	 * that are in the log) to be arbitrarily large.
+	 */
+	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+		uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
+		if (vdevid >= spa->spa_root_vdev->vdev_children) {
+			zfs_panic_recover("blkptr at %p DVA %u has invalid "
+			    "VDEV %llu",
+			    bp, i, (longlong_t)vdevid);
+			continue;
+		}
+		vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
+		if (vd == NULL) {
+			zfs_panic_recover("blkptr at %p DVA %u has invalid "
+			    "VDEV %llu",
+			    bp, i, (longlong_t)vdevid);
+			continue;
+		}
+		if (vd->vdev_ops == &vdev_hole_ops) {
+			zfs_panic_recover("blkptr at %p DVA %u has hole "
+			    "VDEV %llu",
+			    bp, i, (longlong_t)vdevid);
+			continue;
+		}
+		if (vd->vdev_ops == &vdev_missing_ops) {
+			/*
+			 * "missing" vdevs are valid during import, but we
+			 * don't have their detailed info (e.g. asize), so
+			 * we can't perform any more checks on them.
+			 */
+			continue;
+		}
+		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
+		uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
+		if (BP_IS_GANG(bp))
+			asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+		if (offset + asize > vd->vdev_asize) {
+			zfs_panic_recover("blkptr at %p DVA %u has invalid "
+			    "OFFSET %llu",
+			    bp, i, (longlong_t)offset);
+		}
+	}
+}
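
For each DVA, the new zfs_blkptr_verify() checks, among other things, that
the allocation lies inside its vdev.  A minimal sketch of that bounds test
(toy numbers; the real check also special-cases gang blocks and hole or
missing vdevs, as seen above):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
    	uint64_t vdev_asize = 1ULL << 40;	/* 1 TiB vdev (example) */
    	uint64_t offset = (1ULL << 40) - 4096;	/* near the end */
    	uint64_t asize = 8192;			/* allocated size */

    	/* The mirrored test: offset + asize must stay in the vdev. */
    	if (offset + asize > vdev_asize)
    		printf("blkptr DVA has invalid OFFSET %ju\n",
    		    (uintmax_t)offset);
    	return (0);
    }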
+
 zio_t *
 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, const zbookmark_t *zb)
+    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 {
 	zio_t *zio;
 
+	zfs_blkptr_verify(spa, bp);
+
 	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 	    data, size, done, private,
 	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
@@ -694,8 +836,10 @@
 zio_t *
 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     void *data, uint64_t size, const zio_prop_t *zp,
-    zio_done_func_t *ready, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, const zbookmark_t *zb)
+    zio_done_func_t *ready, zio_done_func_t *children_ready,
+    zio_done_func_t *physdone, zio_done_func_t *done,
+    void *private, zio_priority_t priority, enum zio_flag flags,
+    const zbookmark_phys_t *zb)
 {
 	zio_t *zio;
 
@@ -714,20 +858,32 @@
 	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 
 	zio->io_ready = ready;
+	zio->io_children_ready = children_ready;
+	zio->io_physdone = physdone;
 	zio->io_prop = *zp;
 
+	/*
+	 * Data can be NULL if we are going to call zio_write_override() to
+	 * provide the already-allocated BP.  But we may need the data to
+	 * verify a dedup hit (if requested).  In this case, don't try to
+	 * dedup (just take the already-allocated BP verbatim).
+	 */
+	if (data == NULL && zio->io_prop.zp_dedup_verify) {
+		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
+	}
+
 	return (zio);
 }
 
 zio_t *
 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
-    uint64_t size, zio_done_func_t *done, void *private, int priority,
-    enum zio_flag flags, zbookmark_t *zb)
+    uint64_t size, zio_done_func_t *done, void *private,
+    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
 {
 	zio_t *zio;
 
 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
-	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
+	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 
 	return (zio);
@@ -755,8 +911,30 @@
 void
 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 {
+
+	/*
+	 * The check for EMBEDDED is a performance optimization.  We
+	 * process the free here (by ignoring it) rather than
+	 * putting it on the list and then processing it in zio_free_sync().
+	 */
+	if (BP_IS_EMBEDDED(bp))
+		return;
 	metaslab_check_free(spa, bp);
-	bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+
+	/*
+	 * Frees that are for the currently-syncing txg, are not going to be
+	 * deferred, and which will not need to do a read (i.e. not GANG or
+	 * DEDUP), can be processed immediately.  Otherwise, put them on the
+	 * in-memory list for later processing.
+	 */
+	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
+	    txg != spa->spa_syncing_txg ||
+	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
+		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+	} else {
+		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
+		    BP_GET_PSIZE(bp), 0)));
+	}
 }
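
zio_free() now short-circuits the common case: a free in the
currently-syncing txg that needs no read and no TRIM bookkeeping is executed
immediately instead of being appended to the per-txg bplist.  A hedged
restatement of the dispatch predicate (toy signature; the real code consults
zfs_trim_enabled, BP_IS_GANG, BP_GET_DEDUP, spa_syncing_txg and
spa_sync_pass):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static bool
    can_free_now(bool trim_enabled, bool is_gang, bool is_dedup,
        uint64_t txg, uint64_t syncing_txg, int sync_pass, int deferred_pass)
    {
    	/*
    	 * Only frees in the currently-syncing txg that need no extra
    	 * read (not gang, not dedup, no TRIM bookkeeping) and arrive
    	 * before the deferred-free sync pass go out immediately;
    	 * everything else is queued for later processing.
    	 */
    	if (trim_enabled || is_gang || is_dedup)
    		return (false);
    	if (txg != syncing_txg)
    		return (false);
    	if (sync_pass >= deferred_pass)
    		return (false);
    	return (true);
    }

    int
    main(void)
    {
    	printf("%d\n", can_free_now(false, false, false, 7, 7, 1, 2));
    	return (0);
    }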
 
 zio_t *
@@ -764,20 +942,34 @@
     uint64_t size, enum zio_flag flags)
 {
 	zio_t *zio;
+	enum zio_stage stage = ZIO_FREE_PIPELINE;
 
-	dprintf_bp(bp, "freeing in txg %llu, pass %u",
-	    (longlong_t)txg, spa->spa_sync_pass);
-
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(spa_syncing_txg(spa) == txg);
 	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 
+	if (BP_IS_EMBEDDED(bp))
+		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
 	metaslab_check_free(spa, bp);
 	arc_freed(spa, bp);
 
+	if (zfs_trim_enabled)
+		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
+		    ZIO_STAGE_VDEV_IO_ASSESS;
+	/*
+	 * GANG and DEDUP blocks can induce a read (for the gang block header,
+	 * or the DDT), so issue them asynchronously so that this thread is
+	 * not tied up.
+	 */
+	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
+		stage |= ZIO_STAGE_ISSUE_ASYNC;
+
+	flags |= ZIO_FLAG_DONT_QUEUE;
+
 	zio = zio_create(pio, spa, txg, bp, NULL, size,
-	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
-	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
+	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
+	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
 
 	return (zio);
 }
@@ -788,6 +980,11 @@
 {
 	zio_t *zio;
 
+	dprintf_bp(bp, "claiming in txg %llu", txg);
+
+	if (BP_IS_EMBEDDED(bp))
+		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
 	/*
 	 * A claim is an allocation of a specific block.  Claims are needed
 	 * to support immediate writes in the intent log.  The issue is that
@@ -807,6 +1004,7 @@
 	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+	ASSERT0(zio->io_queued_timestamp);
 
 	return (zio);
 }
@@ -813,8 +1011,8 @@
 
 zio_t *
 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
-    uint64_t size, zio_done_func_t *done, void *private, int priority,
-    enum zio_flag flags)
+    uint64_t size, zio_done_func_t *done, void *private,
+    zio_priority_t priority, enum zio_flag flags)
 {
 	zio_t *zio;
 	int c;
@@ -839,7 +1037,7 @@
 zio_t *
 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     void *data, int checksum, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, boolean_t labels)
+    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 {
 	zio_t *zio;
 
@@ -849,8 +1047,8 @@
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
-	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
-	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
+	    NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 
 	zio->io_prop.zp_checksum = checksum;
 
@@ -860,7 +1058,7 @@
 zio_t *
 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     void *data, int checksum, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, boolean_t labels)
+    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 {
 	zio_t *zio;
 
@@ -870,12 +1068,12 @@
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
-	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
-	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
+	    NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 
 	zio->io_prop.zp_checksum = checksum;
 
-	if (zio_checksum_table[checksum].ci_eck) {
+	if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		/*
 		 * zec checksums are necessarily destructive -- they modify
 		 * the end of the write buffer to hold the verifier/checksum.
@@ -895,8 +1093,8 @@
  */
 zio_t *
 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
-	void *data, uint64_t size, int type, int priority, enum zio_flag flags,
-	zio_done_func_t *done, void *private)
+    void *data, uint64_t size, int type, zio_priority_t priority,
+    enum zio_flag flags, zio_done_func_t *done, void *private)
 {
 	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 	zio_t *zio;
@@ -915,6 +1113,10 @@
 		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 	}
 
+	/* Not all IO types require vdev io done stage e.g. free */
+	if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
+		pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;
+
 	if (vd->vdev_children == 0)
 		offset += VDEV_LABEL_START_SIZE;
 
@@ -927,17 +1129,42 @@
 	if (flags & ZIO_FLAG_IO_REPAIR)
 		flags &= ~ZIO_FLAG_SPECULATIVE;
 
+	/*
+	 * If we're creating a child I/O that is not associated with a
+	 * top-level vdev, then the child zio is not an allocating I/O.
+	 * If this is a retried I/O then we ignore it since we will
+	 * have already processed the original allocating I/O.
+	 */
+	if (flags & ZIO_FLAG_IO_ALLOCATING &&
+	    (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
+		metaslab_class_t *mc = spa_normal_class(pio->io_spa);
+
+		ASSERT(mc->mc_alloc_throttle_enabled);
+		ASSERT(type == ZIO_TYPE_WRITE);
+		ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
+		ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
+		ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
+		    pio->io_child_type == ZIO_CHILD_GANG);
+
+		flags &= ~ZIO_FLAG_IO_ALLOCATING;
+	}
+
 	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
 	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
+	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
 
+	zio->io_physdone = pio->io_physdone;
+	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
+		zio->io_logical->io_phys_children++;
+
 	return (zio);
 }
 
 zio_t *
 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
-	int type, int priority, enum zio_flag flags,
-	zio_done_func_t *done, void *private)
+    int type, zio_priority_t priority, enum zio_flag flags,
+    zio_done_func_t *done, void *private)
 {
 	zio_t *zio;
 
@@ -945,7 +1172,7 @@
 
 	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
 	    data, size, done, private, type, priority,
-	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
+	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
 	    vd, offset, NULL,
 	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
 
@@ -966,9 +1193,10 @@
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
-	return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size,
-	    NULL, NULL, ZIO_PRIORITY_TRIM,
-	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
+	return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL,
+	    ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
+	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
 }
 
 void
@@ -1002,12 +1230,20 @@
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    !(zio->io_flags & ZIO_FLAG_RAW)) {
-		uint64_t psize = BP_GET_PSIZE(bp);
+		uint64_t psize =
+		    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
 		void *cbuf = zio_buf_alloc(psize);
 
 		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
 	}
 
+	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+		decode_embedded_bp_compressed(bp, zio->io_data);
+	} else {
+		ASSERT(!BP_IS_EMBEDDED(bp));
+	}
+
 	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 
@@ -1023,22 +1259,6 @@
 static int
 zio_write_bp_init(zio_t *zio)
 {
-	spa_t *spa = zio->io_spa;
-	zio_prop_t *zp = &zio->io_prop;
-	enum zio_compress compress = zp->zp_compress;
-	blkptr_t *bp = zio->io_bp;
-	uint64_t lsize = zio->io_size;
-	uint64_t psize = lsize;
-	int pass = 1;
-
-	/*
-	 * If our children haven't all reached the ready stage,
-	 * wait for them and then repeat this pipeline stage.
-	 */
-	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
-	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
-		return (ZIO_PIPELINE_STOP);
-
 	if (!IO_IS_ALLOCATING(zio))
 		return (ZIO_PIPELINE_CONTINUE);
 
@@ -1045,6 +1265,9 @@
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 
 	if (zio->io_bp_override) {
+		blkptr_t *bp = zio->io_bp;
+		zio_prop_t *zp = &zio->io_prop;
+
 		ASSERT(bp->blk_birth != zio->io_txg);
 		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
 
@@ -1051,6 +1274,9 @@
 		*bp = *zio->io_bp_override;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+		if (BP_IS_EMBEDDED(bp))
+			return (ZIO_PIPELINE_CONTINUE);
+
 		/*
 		 * If we've been overridden and nopwrite is set then
 		 * set the flag accordingly to indicate that a nopwrite
@@ -1058,6 +1284,7 @@
 		 */
 		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
 			ASSERT(!zp->zp_dedup);
+			ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
 			zio->io_flags |= ZIO_FLAG_NOPWRITE;
 			return (ZIO_PIPELINE_CONTINUE);
 		}
@@ -1067,8 +1294,8 @@
 		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
 			return (ZIO_PIPELINE_CONTINUE);
 
-		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
-		    zp->zp_dedup_verify);
+		ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
+		    ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
 
 		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
 			BP_SET_DEDUP(bp, 1);
@@ -1075,12 +1302,57 @@
 			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
 			return (ZIO_PIPELINE_CONTINUE);
 		}
+
+		/*
+		 * We were unable to handle this as an override bp, treat
+		 * it as a regular write I/O.
+		 */
 		zio->io_bp_override = NULL;
-		BP_ZERO(bp);
+		*bp = zio->io_bp_orig;
+		zio->io_pipeline = zio->io_orig_pipeline;
 	}
 
-	if (bp->blk_birth == zio->io_txg) {
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_write_compress(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	zio_prop_t *zp = &zio->io_prop;
+	enum zio_compress compress = zp->zp_compress;
+	blkptr_t *bp = zio->io_bp;
+	uint64_t lsize = zio->io_size;
+	uint64_t psize = lsize;
+	int pass = 1;
+
+	/*
+	 * If our children haven't all reached the ready stage,
+	 * wait for them and then repeat this pipeline stage.
+	 */
+	if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
+	    ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
+		return (ZIO_PIPELINE_STOP);
+	}
+
+	if (!IO_IS_ALLOCATING(zio))
+		return (ZIO_PIPELINE_CONTINUE);
+
+	if (zio->io_children_ready != NULL) {
 		/*
+		 * Now that all our children are ready, run the callback
+		 * associated with this zio in case it wants to modify the
+		 * data to be written.
+		 */
+		ASSERT3U(zp->zp_level, >, 0);
+		zio->io_children_ready(zio);
+	}
+
+	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+	ASSERT(zio->io_bp_override == NULL);
+
+	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
+		/*
 		 * We're rewriting an existing block, which means we're
 		 * working on behalf of spa_sync().  For spa_sync() to
 		 * converge, it must eventually be the case that we don't
@@ -1099,7 +1371,7 @@
 			compress = ZIO_COMPRESS_OFF;
 
 		/* Make sure someone doesn't change their mind on overwrites */
-		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
+		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
 		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
 	}
 
@@ -1109,10 +1381,51 @@
 		if (psize == 0 || psize == lsize) {
 			compress = ZIO_COMPRESS_OFF;
 			zio_buf_free(cbuf, lsize);
+		} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
+		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
+		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
+			encode_embedded_bp_compressed(bp,
+			    cbuf, compress, lsize, psize);
+			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
+			BP_SET_TYPE(bp, zio->io_prop.zp_type);
+			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
+			zio_buf_free(cbuf, lsize);
+			bp->blk_birth = zio->io_txg;
+			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+			ASSERT(spa_feature_is_active(spa,
+			    SPA_FEATURE_EMBEDDED_DATA));
+			return (ZIO_PIPELINE_CONTINUE);
 		} else {
-			ASSERT(psize < lsize);
-			zio_push_transform(zio, cbuf, psize, lsize, NULL);
+			/*
+			 * Round up the compressed size to the ashift of the
+			 * smallest-ashift device, and zero the tail.  This
+			 * ensures that the compressed size of the BP (and
+			 * thus the compressratio property) is correct, in
+			 * that we charge for the padding used to fill out
+			 * the last sector.
+			 */
+			ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
+			size_t rounded = (size_t)P2ROUNDUP(psize,
+			    1ULL << spa->spa_min_ashift);
+			if (rounded >= lsize) {
+				compress = ZIO_COMPRESS_OFF;
+				zio_buf_free(cbuf, lsize);
+				psize = lsize;
+			} else {
+				bzero((char *)cbuf + psize, rounded - psize);
+				psize = rounded;
+				zio_push_transform(zio, cbuf,
+				    psize, lsize, NULL);
+			}
 		}
+
+		/*
+		 * We were unable to handle this as an override bp, treat
+		 * it as a regular write I/O.
+		 */
+		zio->io_bp_override = NULL;
+		*bp = zio->io_bp_orig;
+		zio->io_pipeline = zio->io_orig_pipeline;
 	}
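
The rounding branch above is easier to see with concrete numbers. Below is a
minimal, hedged sketch of the same arithmetic in plain C; the P2ROUNDUP
definition is the usual power-of-two bit trick and is only assumed to match
the kernel macro:

    #include <stdint.h>
    #include <stdio.h>

    /* Power-of-two round-up; assumed equivalent to the kernel's P2ROUNDUP(). */
    #define P2ROUNDUP(x, align) (-(-(uint64_t)(x) & -(uint64_t)(align)))

    int
    main(void)
    {
        uint64_t lsize = 8192;    /* logical size of the block */
        uint64_t psize = 3100;    /* size after compression */
        uint64_t ashift = 12;     /* smallest-ashift device: 4K sectors */
        uint64_t rounded = P2ROUNDUP(psize, 1ULL << ashift);

        /* 3100 rounds up to 4096: still below lsize, so compression is
         * kept and the 996 bytes of zeroed padding are charged to psize. */
        printf("rounded = %ju\n", (uintmax_t)rounded);

        /* A psize of 4097 would round to 8192 (>= lsize) and the write
         * would fall back to ZIO_COMPRESS_OFF instead. */
        return (0);
    }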
 
 	/*
@@ -1123,7 +1436,8 @@
 	 * spa_sync() to allocate new blocks, but force rewrites after that.
 	 * There should only be a handful of blocks after pass 1 in any case.
 	 */
-	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
+	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
+	    BP_GET_PSIZE(bp) == psize &&
 	    pass >= zfs_sync_pass_rewrite) {
 		ASSERT(psize != 0);
 		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
@@ -1135,15 +1449,22 @@
 	}
 
 	if (psize == 0) {
+		if (zio->io_bp_orig.blk_birth != 0 &&
+		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+			BP_SET_LSIZE(bp, lsize);
+			BP_SET_TYPE(bp, zp->zp_type);
+			BP_SET_LEVEL(bp, zp->zp_level);
+			BP_SET_BIRTH(bp, zio->io_txg, 0);
+		}
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 	} else {
 		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
 		BP_SET_LSIZE(bp, lsize);
+		BP_SET_TYPE(bp, zp->zp_type);
+		BP_SET_LEVEL(bp, zp->zp_level);
 		BP_SET_PSIZE(bp, psize);
 		BP_SET_COMPRESS(bp, compress);
 		BP_SET_CHECKSUM(bp, zp->zp_checksum);
-		BP_SET_TYPE(bp, zp->zp_type);
-		BP_SET_LEVEL(bp, zp->zp_level);
 		BP_SET_DEDUP(bp, zp->zp_dedup);
 		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 		if (zp->zp_dedup) {
@@ -1157,7 +1478,6 @@
 			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
 		}
 	}
-
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
@@ -1181,11 +1501,11 @@
  */
 
 static void
-zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
+zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
 {
 	spa_t *spa = zio->io_spa;
 	zio_type_t t = zio->io_type;
-	int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);
+	int flags = (cutinline ? TQ_FRONT : 0);
 
 	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);
 
@@ -1204,31 +1524,43 @@
 		t = ZIO_TYPE_NULL;
 
 	/*
-	 * If this is a high priority I/O, then use the high priority taskq.
+	 * If this is a high priority I/O, then use the high priority taskq if
+	 * available.
 	 */
 	if (zio->io_priority == ZIO_PRIORITY_NOW &&
-	    spa->spa_zio_taskq[t][q + 1] != NULL)
+	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
 		q++;
 
 	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
-#ifdef _KERNEL
-	(void) taskq_dispatch_safe(spa->spa_zio_taskq[t][q],
-	    (task_func_t *)zio_execute, zio, flags, &zio->io_task);
+
+	/*
+	 * NB: We are assuming that the zio can only be dispatched
+	 * to a single taskq at a time.  It would be a grievous error
+	 * to dispatch the zio to another taskq at the same time.
+	 */
+#if defined(illumos) || !defined(_KERNEL)
+	ASSERT(zio->io_tqent.tqent_next == NULL);
 #else
-	(void) taskq_dispatch(spa->spa_zio_taskq[t][q],
-	    (task_func_t *)zio_execute, zio, flags);
+	ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
 #endif
+	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
+	    flags, &zio->io_tqent);
 }
 
 static boolean_t
-zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
+zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
 {
 	kthread_t *executor = zio->io_executor;
 	spa_t *spa = zio->io_spa;
 
-	for (zio_type_t t = 0; t < ZIO_TYPES; t++)
-		if (taskq_member(spa->spa_zio_taskq[t][q], executor))
-			return (B_TRUE);
+	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
+		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+		uint_t i;
+		for (i = 0; i < tqs->stqs_count; i++) {
+			if (taskq_member(tqs->stqs_taskq[i], executor))
+				return (B_TRUE);
+		}
+	}
 
 	return (B_FALSE);
 }
@@ -1247,6 +1579,58 @@
 	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
 }
 
+void
+zio_delay_interrupt(zio_t *zio)
+{
+	/*
+	 * The timeout_generic() function isn't defined in userspace, so
+	 * rather than trying to implement the function, the zio delay
+	 * rather than implementing it there, the zio delay functionality
+	 * is simply disabled for userspace builds.
+
+#ifdef _KERNEL
+	/*
+	 * If io_target_timestamp is zero, then no delay has been registered
+	 * for this IO, so skip the delay and issue it directly to the zio
+	 * layer.
+	 */
+	if (zio->io_target_timestamp != 0) {
+		hrtime_t now = gethrtime();
+
+		if (now >= zio->io_target_timestamp) {
+			/*
+			 * This IO has already taken longer than the target
+			 * delay to complete, so we don't want to delay it
+			 * any longer; we "miss" the delay and issue it
+			 * directly to the zio layer. This is likely due to
+			 * the target latency being set to a value less than
+			 * the underlying hardware can satisfy (e.g. delay
+			 * set to 1ms, but the disks take 10ms to complete an
+			 * IO request).
+			 */
+
+			DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
+			    hrtime_t, now);
+
+			zio_interrupt(zio);
+		} else {
+			hrtime_t diff = zio->io_target_timestamp - now;
+
+			DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
+			    hrtime_t, now, hrtime_t, diff);
+
+			(void) timeout_generic(CALLOUT_NORMAL,
+			    (void (*)(void *))zio_interrupt, zio, diff, 1, 0);
+		}
+
+		return;
+	}
+#endif
+
+	DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
+	zio_interrupt(zio);
+}
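
The hit/miss split in zio_delay_interrupt() reduces to one comparison against
a precomputed deadline. A small userspace sketch of that decision follows; the
monotonic-clock helper and printf calls stand in for gethrtime(), the DTrace
probes and timeout_generic(), which are not modeled here:

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    typedef int64_t hrtime_t;

    static hrtime_t
    now_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ((hrtime_t)ts.tv_sec * 1000000000 + ts.tv_nsec);
    }

    /* Deliver immediately on a miss (or when no delay was registered);
     * otherwise report how long a timer would be armed for. */
    static void
    delay_or_deliver(hrtime_t target)
    {
        hrtime_t now = now_ns();

        if (target == 0 || now >= target)
            printf("miss/skip: interrupt now\n");
        else
            printf("hit: arm timer for %jd ns\n", (intmax_t)(target - now));
    }

    int
    main(void)
    {
        delay_or_deliver(0);                  /* no delay registered */
        delay_or_deliver(now_ns() - 1000);    /* deadline already passed */
        delay_or_deliver(now_ns() + 1000000); /* 1ms in the future */
        return (0);
    }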
+
 /*
  * Execute the I/O pipeline until one of the following occurs:
  *
@@ -1270,6 +1654,8 @@
 {
 	zio->io_executor = curthread;
 
+	ASSERT3U(zio->io_queued_timestamp, >, 0);
+
 	while (zio->io_stage < ZIO_STAGE_DONE) {
 		enum zio_stage pipeline = zio->io_pipeline;
 		enum zio_stage stage = zio->io_stage;
@@ -1303,7 +1689,8 @@
 		}
 
 		zio->io_stage = stage;
-		rv = zio_pipeline[highbit(stage) - 1](zio);
+		zio->io_pipeline_trace |= zio->io_stage;
+		rv = zio_pipeline[highbit64(stage) - 1](zio);
 
 		if (rv == ZIO_PIPELINE_STOP)
 			return;
@@ -1326,6 +1713,8 @@
 	ASSERT(zio->io_executor == NULL);
 
 	zio->io_waiter = curthread;
+	ASSERT0(zio->io_queued_timestamp);
+	zio->io_queued_timestamp = gethrtime();
 
 	zio_execute(zio);
 
@@ -1354,9 +1743,11 @@
 		 */
 		spa_t *spa = zio->io_spa;
 
-		zio_add_child(spa->spa_async_zio_root, zio);
+		zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
 	}
 
+	ASSERT0(zio->io_queued_timestamp);
+	zio->io_queued_timestamp = gethrtime();
 	zio_execute(zio);
 }
 
@@ -1381,6 +1772,7 @@
 	pio->io_pipeline = pio->io_orig_pipeline;
 	pio->io_reexecute = 0;
 	pio->io_flags |= ZIO_FLAG_REEXECUTED;
+	pio->io_pipeline_trace = 0;
 	pio->io_error = 0;
 	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 		pio->io_state[w] = 0;
@@ -1397,8 +1789,9 @@
 	 * the remainder of pio's io_child_list, from 'cio_next' onward,
 	 * cannot be affected by any side effects of reexecuting 'cio'.
 	 */
-	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
-		cio_next = zio_walk_children(pio);
+	zio_link_t *zl = NULL;
+	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+		cio_next = zio_walk_children(pio, &zl);
 		mutex_enter(&pio->io_lock);
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 			pio->io_children[cio->io_child_type][w]++;
@@ -1411,8 +1804,10 @@
 	 * We don't reexecute "The Godfather" I/O here as it's the
 	 * responsibility of the caller to wait on him.
 	 */
-	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
+	if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
+		pio->io_queued_timestamp = gethrtime();
 		zio_execute(pio);
+	}
 }
 
 void
@@ -1756,8 +2151,9 @@
 {
 	blkptr_t *bp = zio->io_bp;
 
-	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
+	if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
 		return (ZIO_PIPELINE_STOP);
+	}
 
 	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
@@ -1806,6 +2202,7 @@
 zio_write_gang_block(zio_t *pio)
 {
 	spa_t *spa = pio->io_spa;
+	metaslab_class_t *mc = spa_normal_class(spa);
 	blkptr_t *bp = pio->io_bp;
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
@@ -1819,10 +2216,43 @@
 	zio_prop_t zp;
 	int error;
 
-	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
-	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
-	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
+	int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
+	if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+		ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+		ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+		flags |= METASLAB_ASYNC_ALLOC;
+		VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
+
+		/*
+		 * The logical zio has already placed a reservation for
+		 * 'copies' allocation slots but gang blocks may require
+		 * additional copies. These additional copies
+		 * (i.e. gbh_copies - copies) are guaranteed to succeed
+		 * since metaslab_class_throttle_reserve() always allows
+		 * additional reservations for gang blocks.
+		 */
+		VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
+		    pio, flags));
+	}
+
+	error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
+	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, pio);
 	if (error) {
+		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+			ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+			/*
+			 * If we failed to allocate the gang block header then
+			 * we remove any additional allocation reservations that
+			 * we placed here. The original reservation will
+			 * be removed when the logical I/O goes to the ready
+			 * stage.
+			 */
+			metaslab_class_throttle_unreserve(mc,
+			    gbh_copies - copies, pio);
+		}
 		pio->io_error = error;
 		return (ZIO_PIPELINE_CONTINUE);
 	}
@@ -1861,11 +2291,25 @@
 		zp.zp_dedup_verify = B_FALSE;
 		zp.zp_nopwrite = B_FALSE;
 
-		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
+		zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
 		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
-		    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
-		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
-		    &pio->io_bookmark));
+		    zio_write_gang_member_ready, NULL, NULL, NULL,
+		    &gn->gn_child[g], pio->io_priority,
+		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+
+		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+			ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+			/*
+			 * Gang children won't throttle but we should
+			 * account for their work, so reserve an allocation
+			 * slot for them here.
+			 */
+			VERIFY(metaslab_class_throttle_reserve(mc,
+			    zp.zp_copies, cio, flags));
+		}
+		zio_nowait(cio);
 	}
 
 	/*
@@ -1879,12 +2323,22 @@
 }
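
The reservation arithmetic in zio_write_gang_block() above is incremental
accounting: the logical I/O already holds zp_copies slots, only the difference
up to gbh_copies is reserved for the gang header, and only that difference is
returned if the allocation fails. A toy sketch of the invariant (the counters
below are simplified stand-ins for metaslab_class_throttle_reserve() and
_unreserve(), not the real interfaces):

    #include <assert.h>
    #include <stdio.h>

    static int slots_held;

    static void reserve(int n)   { slots_held += n; }
    static void unreserve(int n) { slots_held -= n; assert(slots_held >= 0); }

    int
    main(void)
    {
        int copies = 2;          /* slots the logical zio already reserved */
        int gbh_copies = 3;      /* copies wanted for the gang header */
        int alloc_failed = 1;    /* pretend metaslab_alloc() returned an error */

        reserve(copies);              /* done earlier, at throttle time */
        reserve(gbh_copies - copies); /* top-up for the gang header */

        if (alloc_failed)
            unreserve(gbh_copies - copies); /* give back only the top-up */

        /* The original 'copies' reservation is still held; it is released
         * later, when the logical I/O reaches the ready stage. */
        printf("slots still held: %d\n", slots_held);
        return (0);
    }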
 
 /*
- * The zio_nop_write stage in the pipeline determines if allocating
- * a new bp is necessary.  By leveraging a cryptographically secure checksum,
- * such as SHA256, we can compare the checksums of the new data and the old
- * to determine if allocating a new block is required.  The nopwrite
- * feature can handle writes in either syncing or open context (i.e. zil
- * writes) and as a result is mutually exclusive with dedup.
+ * The zio_nop_write stage in the pipeline determines if allocating a
+ * new bp is necessary.  The nopwrite feature can handle writes in
+ * either syncing or open context (i.e. zil writes) and as a result is
+ * mutually exclusive with dedup.
+ *
+ * By leveraging a cryptographically secure checksum, such as SHA256, we
+ * can compare the checksums of the new data and the old to determine if
+ * allocating a new block is required.  Note that our requirements for
+ * cryptographic strength are fairly weak: there can't be any accidental
+ * hash collisions, but we don't need to be secure against intentional
+ * (malicious) collisions.  To trigger a nopwrite, you have to be able
+ * to write the file to begin with, and triggering an incorrect (hash
+ * collision) nopwrite is no worse than simply writing to the file.
+ * That said, there are no known attacks against the checksum algorithms
+ * used for nopwrite, assuming that the salt and the checksums
+ * themselves remain secret.
  */
 static int
 zio_nop_write(zio_t *zio)
@@ -1907,7 +2361,8 @@
 	 * allocate a new bp.
 	 */
 	if (BP_IS_HOLE(bp_orig) ||
-	    !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
+	    !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
+	    ZCHECKSUM_FLAG_NOPWRITE) ||
 	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
 	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
 	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
@@ -1919,7 +2374,8 @@
 	 * avoid allocating a new bp and issuing any I/O.
 	 */
 	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
-		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
+		ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
+		    ZCHECKSUM_FLAG_NOPWRITE);
 		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
 		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
 		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
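
The gate above only permits a nopwrite when the checksum is flagged
nopwrite-capable and the interesting BP properties are unchanged. A hedged
sketch of that eligibility test on a simplified block-pointer struct (the
field names here are illustrative, not the real blkptr_t layout, and the real
test also checks dedup, copies and hole-ness):

    #include <stdbool.h>
    #include <stdio.h>

    struct bp {
        int checksum;
        int compress;
        bool nopwrite_capable;    /* ZCHECKSUM_FLAG_NOPWRITE on its checksum */
    };

    static bool
    nopwrite_eligible(const struct bp *new, const struct bp *old)
    {
        return (new->nopwrite_capable &&
            new->checksum == old->checksum &&
            new->compress == old->compress);
    }

    int
    main(void)
    {
        struct bp old = { 8 /* e.g. sha256 */, 1, true };
        struct bp new = old;

        printf("eligible: %d\n", nopwrite_eligible(&new, &old));
        new.compress = 0;    /* property changed: must allocate a new bp */
        printf("eligible: %d\n", nopwrite_eligible(&new, &old));
        return (0);
    }
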
@@ -2006,8 +2462,9 @@
 {
 	blkptr_t *bp = zio->io_bp;
 
-	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
+	if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
 		return (ZIO_PIPELINE_STOP);
+	}
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
@@ -2064,7 +2521,7 @@
 
 		if (ddp->ddp_phys_birth != 0) {
 			arc_buf_t *abuf = NULL;
-			uint32_t aflags = ARC_WAIT;
+			arc_flags_t aflags = ARC_FLAG_WAIT;
 			blkptr_t blk = *zio->io_bp;
 			int error;
 
@@ -2082,7 +2539,7 @@
 				    bcmp(abuf->b_data, zio->io_orig_data,
 				    zio->io_orig_size) != 0)
 					error = SET_ERROR(EEXIST);
-				VERIFY(arc_buf_remove_ref(abuf, &abuf));
+				arc_buf_destroy(abuf, &abuf);
 			}
 
 			ddt_enter(ddt);
@@ -2111,7 +2568,8 @@
 
 	ddt_phys_fill(ddp, zio->io_bp);
 
-	while ((pio = zio_walk_parents(zio)) != NULL)
+	zio_link_t *zl = NULL;
+	while ((pio = zio_walk_parents(zio, &zl)) != NULL)
 		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
 
 	ddt_exit(ddt);
@@ -2132,7 +2590,8 @@
 	dde->dde_lead_zio[p] = NULL;
 
 	if (zio->io_error == 0) {
-		while (zio_walk_parents(zio) != NULL)
+		zio_link_t *zl = NULL;
+		while (zio_walk_parents(zio, &zl) != NULL)
 			ddt_phys_addref(ddp);
 	} else {
 		ddt_phys_clear(ddp);
@@ -2200,7 +2659,8 @@
 		 * we can't resolve it, so just convert to an ordinary write.
 		 * (And automatically e-mail a paper to Nature?)
 		 */
-		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
+		if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
+		    ZCHECKSUM_FLAG_DEDUP)) {
 			zp->zp_checksum = spa_dedup_checksum(spa);
 			zio_pop_transforms(zio);
 			zio->io_stage = ZIO_STAGE_OPEN;
@@ -2240,8 +2700,8 @@
 		}
 
 		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
-		    zio->io_orig_size, &czp, NULL,
-		    zio_ddt_ditto_write_done, dde, zio->io_priority,
+		    zio->io_orig_size, &czp, NULL, NULL,
+		    NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
 		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
@@ -2262,7 +2722,8 @@
 		ddt_phys_addref(ddp);
 	} else {
 		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
-		    zio->io_orig_size, zp, zio_ddt_child_write_ready,
+		    zio->io_orig_size, zp,
+		    zio_ddt_child_write_ready, NULL, NULL,
 		    zio_ddt_child_write_done, dde, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
@@ -2308,7 +2769,98 @@
  * Allocate and free blocks
  * ==========================================================================
  */
+
+static zio_t *
+zio_io_to_allocate(spa_t *spa)
+{
+	zio_t *zio;
+
+	ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
+
+	zio = avl_first(&spa->spa_alloc_tree);
+	if (zio == NULL)
+		return (NULL);
+
+	ASSERT(IO_IS_ALLOCATING(zio));
+
+	/*
+	 * Try to place a reservation for this zio. If we're unable to
+	 * reserve then we throttle.
+	 */
+	if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
+	    zio->io_prop.zp_copies, zio, 0)) {
+		return (NULL);
+	}
+
+	avl_remove(&spa->spa_alloc_tree, zio);
+	ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
+
+	return (zio);
+}
+
 static int
+zio_dva_throttle(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	zio_t *nio;
+
+	if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
+	    !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
+	    zio->io_child_type == ZIO_CHILD_GANG ||
+	    zio->io_flags & ZIO_FLAG_NODATA) {
+		return (ZIO_PIPELINE_CONTINUE);
+	}
+
+	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+	ASSERT3U(zio->io_queued_timestamp, >, 0);
+	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
+
+	mutex_enter(&spa->spa_alloc_lock);
+
+	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+	avl_add(&spa->spa_alloc_tree, zio);
+
+	nio = zio_io_to_allocate(zio->io_spa);
+	mutex_exit(&spa->spa_alloc_lock);
+
+	if (nio == zio)
+		return (ZIO_PIPELINE_CONTINUE);
+
+	if (nio != NULL) {
+		ASSERT3U(nio->io_queued_timestamp, <=,
+		    zio->io_queued_timestamp);
+		ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE);
+		/*
+		 * We are passing control to a new zio so make sure that
+		 * it is processed by a different thread. We do this to
+		 * avoid stack overflows that can occur when parents are
+		 * throttled and children are making progress. We allow
+		 * it to go to the head of the taskq since it's already
+		 * been waiting.
+		 */
+		zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE);
+	}
+	return (ZIO_PIPELINE_STOP);
+}
+
+void
+zio_allocate_dispatch(spa_t *spa)
+{
+	zio_t *zio;
+
+	mutex_enter(&spa->spa_alloc_lock);
+	zio = zio_io_to_allocate(spa);
+	mutex_exit(&spa->spa_alloc_lock);
+	if (zio == NULL)
+		return;
+
+	ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
+	ASSERT0(zio->io_error);
+	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
+}
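
Taken together, zio_dva_throttle(), zio_io_to_allocate() and
zio_allocate_dispatch() implement a reserve-or-queue pattern: a writer either
claims an allocation slot or parks in an ordered queue, and each completing
I/O wakes the next waiter. A toy single-threaded model of that handoff (a
plain FIFO stands in for the AVL tree ordered by io_queued_timestamp, and no
locking is shown):

    #include <stdio.h>

    #define MAX_SLOTS 2
    static int slots_used;
    static int queue[8], q_head, q_tail;

    /* Only the head of the queue may claim a slot; returns -1 when
     * throttled or when nothing is queued. */
    static int
    try_allocate(void)
    {
        if (q_head == q_tail || slots_used >= MAX_SLOTS)
            return (-1);
        slots_used++;
        return (queue[q_head++]);
    }

    /* A completing I/O returns its slot and wakes the next waiter,
     * like zio_allocate_dispatch(). */
    static void
    io_done(void)
    {
        int next;

        slots_used--;
        if ((next = try_allocate()) != -1)
            printf("dispatched queued io %d\n", next);
    }

    int
    main(void)
    {
        for (int i = 0; i < 4; i++) {
            queue[q_tail++] = i;
            if (try_allocate() == i)
                printf("io %d proceeds immediately\n", i);
        }
        io_done();    /* frees a slot; io 2 is dispatched */
        return (0);
    }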
+
+static int
 zio_dva_allocate(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
@@ -2328,18 +2880,20 @@
 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
-	/*
-	 * The dump device does not support gang blocks so allocation on
-	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
-	 * the "fast" gang feature.
-	 */
-	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
-	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
-	    METASLAB_GANG_CHILD : 0;
+	if (zio->io_flags & ZIO_FLAG_NODATA) {
+		flags |= METASLAB_DONT_THROTTLE;
+	}
+	if (zio->io_flags & ZIO_FLAG_GANG_CHILD) {
+		flags |= METASLAB_GANG_CHILD;
+	}
+	if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) {
+		flags |= METASLAB_ASYNC_ALLOC;
+	}
+
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
-	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
+	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags, zio);
 
-	if (error) {
+	if (error != 0) {
 		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
 		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
 		    error);
@@ -2398,27 +2952,21 @@
  */
 int
 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
-    uint64_t size, boolean_t use_slog)
+    uint64_t size, boolean_t *slog)
 {
 	int error = 1;
 
 	ASSERT(txg > spa_syncing_txg(spa));
 
-	/*
-	 * ZIL blocks are always contiguous (i.e. not gang blocks) so we
-	 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
-	 * when allocating them.
-	 */
-	if (use_slog) {
-		error = metaslab_alloc(spa, spa_log_class(spa), size,
-		    new_bp, 1, txg, old_bp,
-		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
-	}
-
-	if (error) {
+	error = metaslab_alloc(spa, spa_log_class(spa), size,
+	    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
+	if (error == 0) {
+		*slog = TRUE;
+	} else {
 		error = metaslab_alloc(spa, spa_normal_class(spa), size,
-		    new_bp, 1, txg, old_bp,
-		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
+		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
+		if (error == 0)
+			*slog = FALSE;
 	}
 
 	if (error == 0) {
@@ -2454,6 +3002,18 @@
  * Read, write and delete to physical devices
  * ==========================================================================
  */
+
+
+/*
+ * Issue an I/O to the underlying vdev. Typically the issue pipeline
+ * stops after this stage and will resume upon I/O completion.
+ * However, there are instances where the vdev layer may need to
+ * continue the pipeline when an I/O was not issued. Since the I/O
+ * that was sent to the vdev layer might be different from the one
+ * currently active in the pipeline (see vdev_queue_io()), we explicitly
+ * force the underlying vdev layers to call either zio_execute() or
+ * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
+ */
 static int
 zio_vdev_io_start(zio_t *zio)
 {
@@ -2460,6 +3020,7 @@
 	vdev_t *vd = zio->io_vd;
 	uint64_t align;
 	spa_t *spa = zio->io_spa;
+	int ret;
 
 	ASSERT(zio->io_error == 0);
 	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
@@ -2471,20 +3032,24 @@
 		/*
 		 * The mirror_ops handle multiple DVAs in a single BP.
 		 */
-		return (vdev_mirror_ops.vdev_op_io_start(zio));
+		vdev_mirror_ops.vdev_op_io_start(zio);
+		return (ZIO_PIPELINE_STOP);
 	}
 
-	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) {
+	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
+	    zio->io_priority == ZIO_PRIORITY_NOW) {
 		trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
+	ASSERT3P(zio->io_logical, !=, zio);
+
 	/*
 	 * We keep track of time-sensitive I/Os so that the scan thread
 	 * can quickly react to certain workloads.  In particular, we care
 	 * about non-scrubbing, top-level reads and writes with the following
 	 * characteristics:
-	 * 	- synchronous writes of user data to non-slog devices
+	 *	- synchronous writes of user data to non-slog devices
 	 *	- any reads of user data
 	 * When these conditions are met, adjust the timestamp of spa_last_io
 	 * which allows the scan thread to adjust its workload accordingly.
@@ -2501,7 +3066,9 @@
 
 	align = 1ULL << vd->vdev_top->vdev_ashift;
 
-	if (P2PHASE(zio->io_size, align) != 0) {
+	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
+	    P2PHASE(zio->io_size, align) != 0) {
+		/* Transform logical writes to be a full physical block size. */
 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
 		char *abuf = NULL;
 		if (zio->io_type == ZIO_TYPE_READ ||
@@ -2516,8 +3083,24 @@
 		    zio_subblock);
 	}
 
-	ASSERT(P2PHASE(zio->io_offset, align) == 0);
-	ASSERT(P2PHASE(zio->io_size, align) == 0);
+	/*
+	 * If this is not a physical io, make sure that it is properly aligned
+	 * before proceeding.
+	 */
+	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
+		ASSERT0(P2PHASE(zio->io_offset, align));
+		ASSERT0(P2PHASE(zio->io_size, align));
+	} else {
+		/*
+		 * For the physical io we allow alignment
+		 * to a logical block size.
+		 */
+		uint64_t log_align =
+		    1ULL << vd->vdev_top->vdev_logical_ashift;
+		ASSERT0(P2PHASE(zio->io_offset, log_align));
+		ASSERT0(P2PHASE(zio->io_size, log_align));
+	}
+
 	VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
 
 	/*
@@ -2542,34 +3125,37 @@
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
-	if (vd->vdev_ops->vdev_op_leaf &&
-	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
+	if (vd->vdev_ops->vdev_op_leaf) {
+		switch (zio->io_type) {
+		case ZIO_TYPE_READ:
+			if (vdev_cache_read(zio))
+				return (ZIO_PIPELINE_CONTINUE);
+			/* FALLTHROUGH */
+		case ZIO_TYPE_WRITE:
+		case ZIO_TYPE_FREE:
+			if ((zio = vdev_queue_io(zio)) == NULL)
+				return (ZIO_PIPELINE_STOP);
 
-		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
-			return (ZIO_PIPELINE_CONTINUE);
-
-		if ((zio = vdev_queue_io(zio)) == NULL)
-			return (ZIO_PIPELINE_STOP);
-
-		if (!vdev_accessible(vd, zio)) {
-			zio->io_error = SET_ERROR(ENXIO);
-			zio_interrupt(zio);
-			return (ZIO_PIPELINE_STOP);
+			if (!vdev_accessible(vd, zio)) {
+				zio->io_error = SET_ERROR(ENXIO);
+				zio_interrupt(zio);
+				return (ZIO_PIPELINE_STOP);
+			}
+			break;
 		}
-	}
-
-	/*
-	 * Note that we ignore repair writes for TRIM because they can conflict
-	 * with normal writes. This isn't an issue because, by definition, we
-	 * only repair blocks that aren't freed.
-	 */
-	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE &&
-	    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
-		if (!trim_map_write_start(zio))
+		/*
+		 * Note that we ignore repair writes for TRIM because they can
+		 * conflict with normal writes. This isn't an issue because, by
+		 * definition, we only repair blocks that aren't freed.
+		 */
+		if (zio->io_type == ZIO_TYPE_WRITE &&
+		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+		    !trim_map_write_start(zio))
 			return (ZIO_PIPELINE_STOP);
 	}
 
-	return (vd->vdev_ops->vdev_op_io_start(zio));
+	vd->vdev_ops->vdev_op_io_start(zio);
+	return (ZIO_PIPELINE_STOP);
 }
 
 static int
@@ -2579,14 +3165,16 @@
 	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
 	boolean_t unexpected_error = B_FALSE;
 
-	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
+	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
 		return (ZIO_PIPELINE_STOP);
+	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
 
 	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
-	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
+	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE ||
+	    zio->io_type == ZIO_TYPE_FREE)) {
 
 		if (zio->io_type == ZIO_TYPE_WRITE &&
 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
@@ -2605,7 +3193,10 @@
 			zio->io_error = zio_handle_label_injection(zio, EIO);
 
 		if (zio->io_error) {
-			if (!vdev_accessible(vd, zio)) {
+			if (zio->io_error == ENOTSUP &&
+			    zio->io_type == ZIO_TYPE_FREE) {
+				/* Not all devices support TRIM. */
+			} else if (!vdev_accessible(vd, zio)) {
 				zio->io_error = SET_ERROR(ENXIO);
 			} else {
 				unexpected_error = B_TRUE;
@@ -2652,8 +3243,9 @@
 {
 	vdev_t *vd = zio->io_vd;
 
-	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
+	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
 		return (ZIO_PIPELINE_STOP);
+	}
 
 	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
 		spa_config_exit(zio->io_spa, SCL_ZIO, zio);
@@ -2666,7 +3258,8 @@
 	if (zio_injection_enabled && zio->io_error == 0)
 		zio->io_error = zio_handle_fault_injection(zio, EIO);
 
-	if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM)
+	if (zio->io_type == ZIO_TYPE_FREE &&
+	    zio->io_priority != ZIO_PRIORITY_NOW) {
 		switch (zio->io_error) {
 		case 0:
 			ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
@@ -2679,6 +3272,7 @@
 			ZIO_TRIM_STAT_BUMP(failed);
 			break;
 		}
+	}
 
 	/*
 	 * If the I/O failed, determine whether we should attempt to retry it.
@@ -2719,6 +3313,13 @@
 	if (zio->io_error)
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+	    zio->io_physdone != NULL) {
+		ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
+		ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
+		zio->io_physdone(zio->io_logical);
+	}
+
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
@@ -2807,7 +3408,8 @@
 
 	if ((error = zio_checksum_error(zio, &info)) != 0) {
 		zio->io_error = error;
-		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+		if (error == ECKSUM &&
+		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 			zfs_ereport_start_checksum(zio->io_spa,
 			    zio->io_vd, zio, zio->io_offset,
 			    zio->io_size, NULL, &info);
@@ -2829,7 +3431,7 @@
 /*
  * ==========================================================================
  * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
- * An error of 0 indictes success.  ENXIO indicates whole-device failure,
+ * An error of 0 indicates success.  ENXIO indicates whole-device failure,
  * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
  * indicate errors that are specific to one I/O, and most likely permanent.
  * Any other error is presumed to be worse because we weren't expecting it.
@@ -2862,10 +3464,12 @@
 {
 	blkptr_t *bp = zio->io_bp;
 	zio_t *pio, *pio_next;
+	zio_link_t *zl = NULL;
 
-	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
-	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
+	if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
+	    ZIO_WAIT_READY)) {
 		return (ZIO_PIPELINE_STOP);
+	}
 
 	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
@@ -2879,12 +3483,26 @@
 	if (bp != NULL && bp != &zio->io_bp_copy)
 		zio->io_bp_copy = *bp;
 
-	if (zio->io_error)
+	if (zio->io_error != 0) {
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+		if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+			ASSERT(IO_IS_ALLOCATING(zio));
+			ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+			/*
+			 * We were unable to allocate anything, unreserve and
+			 * issue the next I/O to allocate.
+			 */
+			metaslab_class_throttle_unreserve(
+			    spa_normal_class(zio->io_spa),
+			    zio->io_prop.zp_copies, zio);
+			zio_allocate_dispatch(zio->io_spa);
+		}
+	}
+
 	mutex_enter(&zio->io_lock);
 	zio->io_state[ZIO_WAIT_READY] = 1;
-	pio = zio_walk_parents(zio);
+	pio = zio_walk_parents(zio, &zl);
 	mutex_exit(&zio->io_lock);
 
 	/*
@@ -2895,7 +3513,7 @@
 	 * all parents must wait for us to be done before they can be done.
 	 */
 	for (; pio != NULL; pio = pio_next) {
-		pio_next = zio_walk_parents(zio);
+		pio_next = zio_walk_parents(zio, &zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
 	}
 
@@ -2915,6 +3533,66 @@
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
+/*
+ * Update the allocation throttle accounting.
+ */
+static void
+zio_dva_throttle_done(zio_t *zio)
+{
+	zio_t *lio = zio->io_logical;
+	zio_t *pio = zio_unique_parent(zio);
+	vdev_t *vd = zio->io_vd;
+	int flags = METASLAB_ASYNC_ALLOC;
+
+	ASSERT3P(zio->io_bp, !=, NULL);
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+	ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
+	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+	ASSERT(vd != NULL);
+	ASSERT3P(vd, ==, vd->vdev_top);
+	ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY)));
+	ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
+	ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
+	ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
+
+	/*
+	 * Parents of gang children can have two flavors -- ones that
+	 * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
+	 * and ones that allocated the constituent blocks. The allocation
+	 * throttle needs to know the allocating parent zio so we must find
+	 * it here.
+	 */
+	if (pio->io_child_type == ZIO_CHILD_GANG) {
+		/*
+		 * If our parent is a rewrite gang child then our grandparent
+		 * would have been the one that performed the allocation.
+		 */
+		if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
+			pio = zio_unique_parent(pio);
+		flags |= METASLAB_GANG_CHILD;
+	}
+
+	ASSERT(IO_IS_ALLOCATING(pio));
+	ASSERT3P(zio, !=, zio->io_logical);
+	ASSERT(zio->io_logical != NULL);
+	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
+	ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
+
+	mutex_enter(&pio->io_lock);
+	metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
+	mutex_exit(&pio->io_lock);
+
+	metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
+	    1, pio);
+
+	/*
+	 * Call into the pipeline to see if there is more work that
+	 * needs to be done. If there is work to be done it will be
+	 * dispatched to another taskq thread.
+	 */
+	zio_allocate_dispatch(zio->io_spa);
+}
+
 static int
 zio_done(zio_t *zio)
 {
@@ -2924,22 +3602,46 @@
 	vdev_t *vd = zio->io_vd;
 	uint64_t psize = zio->io_size;
 	zio_t *pio, *pio_next;
+	metaslab_class_t *mc = spa_normal_class(spa);
+	zio_link_t *zl = NULL;
 
 	/*
 	 * If our children haven't all completed,
 	 * wait for them and then repeat this pipeline stage.
 	 */
-	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
-	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
-	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
-	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
+	if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
 		return (ZIO_PIPELINE_STOP);
+	}
 
+	/*
+	 * If the allocation throttle is enabled, then update the accounting.
+	 * We only track child I/Os that are part of an allocating async
+	 * write. We must do this since the allocation is performed
+	 * by the logical I/O but the actual write is done by child I/Os.
+	 */
+	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
+	    zio->io_child_type == ZIO_CHILD_VDEV) {
+		ASSERT(mc->mc_alloc_throttle_enabled);
+		zio_dva_throttle_done(zio);
+	}
+
+	/*
+	 * If the allocation throttle is enabled, verify that
+	 * we have decremented the refcounts for every I/O that was throttled.
+	 */
+	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+		ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+		ASSERT(bp != NULL);
+		metaslab_group_alloc_verify(spa, zio->io_bp, zio);
+		VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio));
+	}
+
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 			ASSERT(zio->io_children[c][w] == 0);
 
-	if (bp != NULL) {
+	if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
 		ASSERT(bp->blk_pad[0] == 0);
 		ASSERT(bp->blk_pad[1] == 0);
 		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
@@ -3104,13 +3806,15 @@
 		 * trouble (e.g. suspended). This allows "The Godfather"
 		 * I/O to return status without blocking.
 		 */
-		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
-			zio_link_t *zl = zio->io_walk_link;
-			pio_next = zio_walk_parents(zio);
+		zl = NULL;
+		for (pio = zio_walk_parents(zio, &zl); pio != NULL;
+		    pio = pio_next) {
+			zio_link_t *remove_zl = zl;
+			pio_next = zio_walk_parents(zio, &zl);
 
 			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
 			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
-				zio_remove_child(pio, zio, zl);
+				zio_remove_child(pio, zio, remove_zl);
 				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
 			}
 		}
@@ -3135,16 +3839,14 @@
 			 * Reexecution is potentially a huge amount of work.
 			 * Hand it off to the otherwise-unused claim taskq.
 			 */
-#ifdef _KERNEL
-			(void) taskq_dispatch_safe(
-			    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
-			    (task_func_t *)zio_reexecute, zio, TQ_SLEEP,
-			    &zio->io_task);
+#if defined(illumos) || !defined(_KERNEL)
+			ASSERT(zio->io_tqent.tqent_next == NULL);
 #else
-			(void) taskq_dispatch(
-			    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
-			    (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
+			ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
 #endif
+			spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
+			    ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
+			    0, &zio->io_tqent);
 		}
 		return (ZIO_PIPELINE_STOP);
 	}
@@ -3176,10 +3878,11 @@
 	zio->io_state[ZIO_WAIT_DONE] = 1;
 	mutex_exit(&zio->io_lock);
 
-	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
-		zio_link_t *zl = zio->io_walk_link;
-		pio_next = zio_walk_parents(zio);
-		zio_remove_child(pio, zio, zl);
+	zl = NULL;
+	for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
+		zio_link_t *remove_zl = zl;
+		pio_next = zio_walk_parents(zio, &zl);
+		zio_remove_child(pio, zio, remove_zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
 	}
 
@@ -3203,9 +3906,10 @@
 static zio_pipe_stage_t *zio_pipeline[] = {
 	NULL,
 	zio_read_bp_init,
+	zio_write_bp_init,
 	zio_free_bp_init,
 	zio_issue_async,
-	zio_write_bp_init,
+	zio_write_compress,
 	zio_checksum_generate,
 	zio_nop_write,
 	zio_ddt_read_start,
@@ -3214,6 +3918,7 @@
 	zio_ddt_free,
 	zio_gang_assemble,
 	zio_gang_issue,
+	zio_dva_throttle,
 	zio_dva_allocate,
 	zio_dva_free,
 	zio_dva_claim,
@@ -3225,44 +3930,127 @@
 	zio_done
 };
 
-/* dnp is the dnode for zb1->zb_object */
-boolean_t
-zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
-    const zbookmark_t *zb2)
-{
-	uint64_t zb1nextL0, zb2thisobj;
 
-	ASSERT(zb1->zb_objset == zb2->zb_objset);
-	ASSERT(zb2->zb_level == 0);
 
+
+/*
+ * Compare two zbookmark_phys_t's to see which we would reach first in a
+ * pre-order traversal of the object tree.
+ *
+ * This is simple in every case aside from the meta-dnode object. For all other
+ * objects, we traverse them in order (object 1 before object 2, and so on).
+ * However, all of these objects are traversed while traversing object 0, since
+ * the data it points to is the list of objects.  Thus, we need to convert to a
+ * canonical representation so we can compare meta-dnode bookmarks to
+ * non-meta-dnode bookmarks.
+ *
+ * We do this by calculating "equivalents" for each field of the zbookmark.
+ * zbookmarks outside of the meta-dnode use their own object and level, and
+ * calculate the level 0 equivalent (the first L0 blkid that is contained in the
+ * blocks this bookmark refers to) by multiplying their blkid by their span
+ * (the number of L0 blocks contained within one block at their level).
+ * zbookmarks inside the meta-dnode calculate their object equivalent
+ * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
+ * level + 1<<31 (a value larger than any level could ever be) for their level.
+ * This causes them to always compare before a bookmark in their object
+ * equivalent, compare appropriately to bookmarks in other objects, and to
+ * compare appropriately to other bookmarks in the meta-dnode.
+ */
+int
+zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
+    const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
+{
 	/*
-	 * A bookmark in the deadlist is considered to be after
-	 * everything else.
+	 * These variables represent the "equivalent" values for the zbookmark,
+	 * after converting zbookmarks inside the meta dnode to their
+	 * normal-object equivalents.
 	 */
-	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
-		return (B_TRUE);
+	uint64_t zb1obj, zb2obj;
+	uint64_t zb1L0, zb2L0;
+	uint64_t zb1level, zb2level;
 
-	/* The objset_phys_t isn't before anything. */
-	if (dnp == NULL)
-		return (B_FALSE);
+	if (zb1->zb_object == zb2->zb_object &&
+	    zb1->zb_level == zb2->zb_level &&
+	    zb1->zb_blkid == zb2->zb_blkid)
+		return (0);
 
-	zb1nextL0 = (zb1->zb_blkid + 1) <<
-	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+	/*
+	 * BP_SPANB calculates the span in blocks.
+	 */
+	zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
+	zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
 
-	zb2thisobj = zb2->zb_object ? zb2->zb_object :
-	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
 	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
-		uint64_t nextobj = zb1nextL0 *
-		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
-		return (nextobj <= zb2thisobj);
+		zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+		zb1L0 = 0;
+		zb1level = zb1->zb_level + COMPARE_META_LEVEL;
+	} else {
+		zb1obj = zb1->zb_object;
+		zb1level = zb1->zb_level;
 	}
 
-	if (zb1->zb_object < zb2thisobj)
-		return (B_TRUE);
-	if (zb1->zb_object > zb2thisobj)
+	if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
+		zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+		zb2L0 = 0;
+		zb2level = zb2->zb_level + COMPARE_META_LEVEL;
+	} else {
+		zb2obj = zb2->zb_object;
+		zb2level = zb2->zb_level;
+	}
+
+	/* Now that we have a canonical representation, do the comparison. */
+	if (zb1obj != zb2obj)
+		return (zb1obj < zb2obj ? -1 : 1);
+	else if (zb1L0 != zb2L0)
+		return (zb1L0 < zb2L0 ? -1 : 1);
+	else if (zb1level != zb2level)
+		return (zb1level > zb2level ? -1 : 1);
+	/*
+	 * This can (theoretically) happen if the bookmarks have the same object
+	 * This can (theoretically) happen if the bookmarks have the same
+	 * object and level but different blkids, which requires the block
+	 * sizes to differ.  There is presently no way to change the indirect
+	 * block sizes, so this should not arise in practice.
+	return (0);
+}
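
A worked example helps make the canonicalization concrete. For a meta-dnode
bookmark at level 1, blkid 5, with 128K indirect blocks (indblkshift 17) and
16K dnode blocks (32 sectors), the L0 equivalent is 5 * 1024 = 5120 and the
object equivalent is 5120 * 32 = 163840. The sketch below redoes that
arithmetic; the constants mirror the on-disk ones but are assumptions of this
example, not taken from the headers:

    #include <stdint.h>
    #include <stdio.h>

    #define SPA_MINBLOCKSHIFT  9
    #define SPA_BLKPTRSHIFT    7    /* 128-byte blkptrs */
    #define DNODE_SHIFT        9    /* 512-byte dnodes */

    /* Span, in L0 blocks, of one block at the given level (cf. BP_SPANB). */
    static uint64_t
    spanb(uint8_t indblkshift, uint64_t level)
    {
        return (1ULL << (level * (indblkshift - SPA_BLKPTRSHIFT)));
    }

    int
    main(void)
    {
        uint16_t dbss = 32;    /* 16K dnode blocks, in 512-byte sectors */
        uint8_t ibs = 17;      /* 128K indirect blocks: 1024 blkptrs each */
        uint64_t blkid = 5, level = 1;

        uint64_t l0 = blkid * spanb(ibs, level);
        uint64_t obj = l0 * (dbss << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));

        /* The meta-dnode bookmark sorts as if it were object 163840 with
         * L0 equivalent 0 and level 1 + COMPARE_META_LEVEL. */
        printf("L0 equivalent %ju, object equivalent %ju\n",
            (uintmax_t)l0, (uintmax_t)obj);
        return (0);
    }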
+
+/*
+ *  This function checks the following: given that last_block is the place that
+ *  our traversal stopped last time, does that guarantee that we've visited
+ *  every node under subtree_root?  Therefore, we can't just use the raw output
+ *  of zbookmark_compare.  We have to pass in a modified version of
+ *  subtree_root; by incrementing the block id, and then checking whether
+ *  last_block is before or equal to that, we can tell whether or not having
+ *  visited last_block implies that all of subtree_root's children have been
+ *  visited.
+ */
+boolean_t
+zbookmark_subtree_completed(const dnode_phys_t *dnp,
+    const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
+{
+	zbookmark_phys_t mod_zb = *subtree_root;
+	mod_zb.zb_blkid++;
+	ASSERT(last_block->zb_level == 0);
+
+	/* The objset_phys_t isn't before anything. */
+	if (dnp == NULL)
 		return (B_FALSE);
-	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
-		return (B_FALSE);
-	return (zb1nextL0 <= zb2->zb_blkid);
+
+	/*
+	 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
+	 * data block size in sectors, because that variable is only used if
+	 * the bookmark refers to a block in the meta-dnode.  Since we don't
+	 * know without examining it what object it refers to, and there's no
+	 * harm in passing in this value in other cases, we always pass it in.
+	 *
+	 * We pass in 0 for the indirect block size shift because zb2 must be
+	 * level 0.  The indirect block size is only used to calculate the span
+	 * of the bookmark, but since the bookmark must be level 0, the span is
+	 * always 1, so the math works out.
+	 *
+	 * If you make changes to how the zbookmark_compare code works, make
+	 * sure that this code still works afterwards.
+	 */
+	return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
+	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
+	    last_block) <= 0);
 }
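
Stripped of the bookmark machinery, the check is "increment the subtree root,
then ask whether the traversal cursor reached at least that key". A minimal
integer-keyed sketch of the same idea (a deliberate simplification; the real
comparison goes through zbookmark_compare() with the parameters described
above):

    #include <stdbool.h>
    #include <stdio.h>

    /* The subtree rooted at key 'root' is fully visited iff the traversal
     * cursor has reached at least the first key past it. */
    static bool
    subtree_completed(long root, long last_visited)
    {
        long just_past = root + 1;    /* the mod_zb.zb_blkid++ step */
        return (last_visited >= just_past);
    }

    int
    main(void)
    {
        printf("%d\n", subtree_completed(10, 10));    /* 0: not done yet */
        printf("%d\n", subtree_completed(10, 11));    /* 1: done */
        return (0);
    }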

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,11 +21,14 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
+#include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zil.h>
@@ -59,28 +62,105 @@
  * checksum function of the appropriate strength.  When reading a block,
  * we compare the expected checksum against the actual checksum, which we
  * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
+ *
+ * SALTED CHECKSUMS
+ *
+ * To enable the use of less secure hash algorithms with dedup, we
+ * introduce the notion of salted checksums (MACs, really).  A salted
+ * checksum is fed both a random 256-bit value (the salt) and the data
+ * to be checksummed.  This salt is kept secret (stored on the pool, but
+ * never shown to the user).  Thus even if an attacker knew of collision
+ * weaknesses in the hash algorithm, they would not be able to mount a
+ * known-plaintext attack on the DDT, since the actual hash value cannot be
+ * known ahead of time.  How the salt is used is algorithm-specific
+ * (some might simply prefix it to the data block, others might need to
+ * utilize a full-blown HMAC).  On disk the salt is stored in a ZAP
+ * object in the MOS (DMU_POOL_CHECKSUM_SALT).
+ *
+ * CONTEXT TEMPLATES
+ *
+ * Some hashing algorithms need to perform a substantial amount of
+ * initialization work (e.g. salted checksums above may need to pre-hash
+ * the salt) before being able to process data.  Performing this
+ * redundant work for each block would be wasteful, so we instead allow
+ * a checksum algorithm to do the work once (the first time it's used)
+ * and then keep this pre-initialized context as a template inside the
+ * spa_t (spa_cksum_tmpls).  If the zio_checksum_info_t contains
+ * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to
+ * construct and destruct the pre-initialized checksum context.  The
+ * pre-initialized context is then reused during each checksum
+ * invocation and passed to the checksum function.
  */
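
One way to picture a salted checksum is hashing the secret salt ahead of the
data, which is also exactly the work a context template lets you cache. A toy
sketch with FNV-1a as a stand-in hash (the real algorithms are Skein and
Edon-R with proper MAC-style constructions; nothing here reflects the actual
on-disk format):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Toy 64-bit FNV-1a, a stand-in for a real salted hash such as Skein. */
    static uint64_t
    fnv1a(const void *buf, size_t len, uint64_t h)
    {
        const uint8_t *p = buf;

        while (len--)
            h = (h ^ *p++) * 0x100000001b3ULL;
        return (h);
    }

    int
    main(void)
    {
        uint8_t salt[32] = { 0xa5 };    /* secret; kept in the MOS ZAP */
        const char *data = "block contents";
        uint64_t h = 0xcbf29ce484222325ULL;

        /* Pre-hashing the salt is the part a context template would cache
         * so it is not redone for every block. */
        h = fnv1a(salt, sizeof (salt), h);
        h = fnv1a(data, strlen(data), h);
        printf("salted cksum: %jx\n", (uintmax_t)h);
        return (0);
    }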
 
 /*ARGSUSED*/
 static void
-zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
+zio_checksum_off(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 }
 
 zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
-	{{NULL,			NULL},			0, 0, 0, "inherit"},
-	{{NULL,			NULL},			0, 0, 0, "on"},
-	{{zio_checksum_off,	zio_checksum_off},	0, 0, 0, "off"},
-	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "label"},
-	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "gang_header"},
-	{{fletcher_2_native,	fletcher_2_byteswap},	0, 1, 0, "zilog"},
-	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0, 0, "fletcher2"},
-	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0, 0, "fletcher4"},
-	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0, 1, "sha256"},
-	{{fletcher_4_native,	fletcher_4_byteswap},	0, 1, 0, "zilog2"},
+	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
+	{{NULL, NULL}, NULL, NULL, 0, "on"},
+	{{zio_checksum_off,		zio_checksum_off},
+	    NULL, NULL, 0, "off"},
+	{{zio_checksum_SHA256,		zio_checksum_SHA256},
+	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+	    "label"},
+	{{zio_checksum_SHA256,		zio_checksum_SHA256},
+	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+	    "gang_header"},
+	{{fletcher_2_native,		fletcher_2_byteswap},
+	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
+	{{fletcher_2_native,		fletcher_2_byteswap},
+	    NULL, NULL, 0, "fletcher2"},
+	{{fletcher_4_native,		fletcher_4_byteswap},
+	    NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
+	{{zio_checksum_SHA256,		zio_checksum_SHA256},
+	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+	    ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
+	{{fletcher_4_native,		fletcher_4_byteswap},
+	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
+	{{zio_checksum_off,		zio_checksum_off},
+	    NULL, NULL, 0, "noparity"},
+#ifdef illumos
+	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
+	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+	    ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
+	{{zio_checksum_skein_native,	zio_checksum_skein_byteswap},
+	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
+	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
+	{{zio_checksum_edonr_native,	zio_checksum_edonr_byteswap},
+	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
+	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
+	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
+#endif
 };
 
+/*
+ * The flag corresponding to the "verify" in dedup=[checksum,]verify
+ * must be cleared first, so callers should use ZIO_CHECKSUM_MASK.
+ */
+spa_feature_t
+zio_checksum_to_feature(enum zio_checksum cksum)
+{
+#ifdef illumos
+	VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0);
+
+	switch (cksum) {
+	case ZIO_CHECKSUM_SHA512:
+		return (SPA_FEATURE_SHA512);
+	case ZIO_CHECKSUM_SKEIN:
+		return (SPA_FEATURE_SKEIN);
+	case ZIO_CHECKSUM_EDONR:
+		return (SPA_FEATURE_EDONR);
+	}
+#endif
+	return (SPA_FEATURE_NONE);
+}
+
 enum zio_checksum
 zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
 {
@@ -114,7 +194,8 @@
 	if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
 		return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
 
-	ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup ||
+	ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags &
+	    ZCHECKSUM_FLAG_DEDUP) ||
 	    (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
 
 	return (child);
@@ -147,21 +228,48 @@
 }
 
 /*
+ * Calls the template init function of a checksum which supports context
+ * templates and installs the template into the spa_t.
+ */
+static void
+zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
+{
+	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+	if (ci->ci_tmpl_init == NULL)
+		return;
+	if (spa->spa_cksum_tmpls[checksum] != NULL)
+		return;
+
+	VERIFY(ci->ci_tmpl_free != NULL);
+	mutex_enter(&spa->spa_cksum_tmpls_lock);
+	if (spa->spa_cksum_tmpls[checksum] == NULL) {
+		spa->spa_cksum_tmpls[checksum] =
+		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
+		VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
+	}
+	mutex_exit(&spa->spa_cksum_tmpls_lock);
+}
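
zio_checksum_template_init() above is the classic check, lock, re-check
pattern: an unlocked fast path, then the mutex and a second test so racing
callers build the template only once. A hedged pthread sketch of the same
shape (strictly portable C11 code would use atomics for the unlocked read;
the kernel version, like this sketch, relies on the pattern being benign on
its target platforms, and the teardown side corresponds to
zio_checksum_templates_free()):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_mutex_t tmpl_lock = PTHREAD_MUTEX_INITIALIZER;
    static void *tmpl;    /* one slot; the spa keeps an array per checksum */

    static void *
    expensive_init(void)
    {
        return (malloc(64));    /* stands in for pre-hashing the salt */
    }

    /* Unlocked fast path, then lock and re-check so racing callers never
     * build the template twice. */
    static void *
    get_template(void)
    {
        if (tmpl == NULL) {
            pthread_mutex_lock(&tmpl_lock);
            if (tmpl == NULL)
                tmpl = expensive_init();
            pthread_mutex_unlock(&tmpl_lock);
        }
        return (tmpl);
    }

    int
    main(void)
    {
        /* Both calls return the same pointer; only the first initializes. */
        printf("%p %p\n", get_template(), get_template());
        return (0);
    }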
+
+/*
  * Generate the checksum.
  */
 void
 zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
-	void *data, uint64_t size)
+    void *data, uint64_t size)
 {
 	blkptr_t *bp = zio->io_bp;
 	uint64_t offset = zio->io_offset;
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 	zio_cksum_t cksum;
+	spa_t *spa = zio->io_spa;
 
 	ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
 	ASSERT(ci->ci_func[0] != NULL);
 
-	if (ci->ci_eck) {
+	zio_checksum_template_init(checksum, spa);
+
+	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		zio_eck_t *eck;
 
 		if (checksum == ZIO_CHECKSUM_ZILOG2) {
@@ -180,33 +288,31 @@
 		else
 			bp->blk_cksum = eck->zec_cksum;
 		eck->zec_magic = ZEC_MAGIC;
-		ci->ci_func[0](data, size, &cksum);
+		ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+		    &cksum);
 		eck->zec_cksum = cksum;
 	} else {
-		ci->ci_func[0](data, size, &bp->blk_cksum);
+		ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+		    &bp->blk_cksum);
 	}
 }
 
 int
-zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
+zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
+    void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
 {
-	blkptr_t *bp = zio->io_bp;
-	uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
-	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+	zio_cksum_t actual_cksum, expected_cksum;
 	int byteswap;
-	int error;
-	uint64_t size = (bp == NULL ? zio->io_size :
-	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
-	uint64_t offset = zio->io_offset;
-	void *data = zio->io_data;
-	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
-	zio_cksum_t actual_cksum, expected_cksum, verifier;
 
 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
 		return (SET_ERROR(EINVAL));
 
-	if (ci->ci_eck) {
+	zio_checksum_template_init(checksum, spa);
+
+	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		zio_eck_t *eck;
+		zio_cksum_t verifier;
 
 		if (checksum == ZIO_CHECKSUM_ZILOG2) {
 			zil_chain_t *zilc = data;
@@ -242,35 +348,76 @@
 
 		expected_cksum = eck->zec_cksum;
 		eck->zec_cksum = verifier;
-		ci->ci_func[byteswap](data, size, &actual_cksum);
+		ci->ci_func[byteswap](data, size,
+		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
 		eck->zec_cksum = expected_cksum;
 
-		if (byteswap)
+		if (byteswap) {
 			byteswap_uint64_array(&expected_cksum,
 			    sizeof (zio_cksum_t));
+		}
 	} else {
-		ASSERT(!BP_IS_GANG(bp));
 		byteswap = BP_SHOULD_BYTESWAP(bp);
 		expected_cksum = bp->blk_cksum;
-		ci->ci_func[byteswap](data, size, &actual_cksum);
+		ci->ci_func[byteswap](data, size,
+		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
 	}
 
-	info->zbc_expected = expected_cksum;
-	info->zbc_actual = actual_cksum;
-	info->zbc_checksum_name = ci->ci_name;
-	info->zbc_byteswapped = byteswap;
-	info->zbc_injected = 0;
-	info->zbc_has_cksum = 1;
+	if (info != NULL) {
+		info->zbc_expected = expected_cksum;
+		info->zbc_actual = actual_cksum;
+		info->zbc_checksum_name = ci->ci_name;
+		info->zbc_byteswapped = byteswap;
+		info->zbc_injected = 0;
+		info->zbc_has_cksum = 1;
+	}
 
 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
 		return (SET_ERROR(ECKSUM));
 
-	if (zio_injection_enabled && !zio->io_error &&
+	return (0);
+}
+
+int
+zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
+{
+	blkptr_t *bp = zio->io_bp;
+	uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+	int error;
+	uint64_t size = (bp == NULL ? zio->io_size :
+	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
+	uint64_t offset = zio->io_offset;
+	void *data = zio->io_data;
+	spa_t *spa = zio->io_spa;
+
+	error = zio_checksum_error_impl(spa, bp, checksum, data, size,
+	    offset, info);
+	if (error != 0 && zio_injection_enabled && !zio->io_error &&
 	    (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {
 
 		info->zbc_injected = 1;
 		return (error);
 	}
+	return (error);
+}
 
-	return (0);
+/*
+ * Called by a spa_t that's about to be deallocated. This steps through
+ * all of the checksum context templates and deallocates any that were
+ * initialized using the algorithm-specific template init function.
+ */
+void
+zio_checksum_templates_free(spa_t *spa)
+{
+	for (enum zio_checksum checksum = 0;
+	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
+		if (spa->spa_cksum_tmpls[checksum] != NULL) {
+			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+			VERIFY(ci->ci_tmpl_free != NULL);
+			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
+			spa->spa_cksum_tmpls[checksum] = NULL;
+		}
+	}
 }
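The init/free pair above is a double-checked locking pattern: an unlocked fast path, then a re-check under spa_cksum_tmpls_lock so only one thread ever builds a pool's template, while the free side can run unlocked at spa teardown because no new I/O can race with deallocation. A minimal user-space sketch of the same shape (pthread stand-ins; every name here is illustrative, not from the tree):

#include <assert.h>
#include <pthread.h>
#include <stdlib.h>

/* Hypothetical stand-ins for the per-pool template slot and its lock. */
static void *tmpl;
static pthread_mutex_t tmpl_lock = PTHREAD_MUTEX_INITIALIZER;

static void *
tmpl_build(void)
{
	return (malloc(32));	/* build the salted context template */
}

static void
template_init_once(void)
{
	/* Unlocked fast path; the kernel code relies on the same benign race. */
	if (tmpl != NULL)
		return;

	pthread_mutex_lock(&tmpl_lock);
	if (tmpl == NULL) {	/* re-check: another thread may have won */
		tmpl = tmpl_build();
		assert(tmpl != NULL);
	}
	pthread_mutex_unlock(&tmpl_lock);
}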

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -34,10 +34,31 @@
 
 #include <sys/zfs_context.h>
 #include <sys/compress.h>
+#include <sys/kstat.h>
 #include <sys/spa.h>
+#include <sys/zfeature.h>
 #include <sys/zio.h>
 #include <sys/zio_compress.h>
 
+typedef struct zcomp_stats {
+	kstat_named_t zcompstat_attempts;
+	kstat_named_t zcompstat_empty;
+	kstat_named_t zcompstat_skipped_insufficient_gain;
+} zcomp_stats_t;
+
+static zcomp_stats_t zcomp_stats = {
+	{ "attempts",			KSTAT_DATA_UINT64 },
+	{ "empty",			KSTAT_DATA_UINT64 },
+	{ "skipped_insufficient_gain",	KSTAT_DATA_UINT64 }
+};
+
+#define	ZCOMPSTAT_INCR(stat, val) \
+	atomic_add_64(&zcomp_stats.stat.value.ui64, (val))
+
+#define	ZCOMPSTAT_BUMP(stat)		ZCOMPSTAT_INCR(stat, 1)
+
+kstat_t		*zcomp_ksp;
+
 /*
  * Compression vectors.
  */
@@ -62,19 +83,27 @@
 };
 
 enum zio_compress
-zio_compress_select(enum zio_compress child, enum zio_compress parent)
+zio_compress_select(spa_t *spa, enum zio_compress child,
+    enum zio_compress parent)
 {
+	enum zio_compress result;
+
 	ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
 	ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
-	ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON);
+	ASSERT(parent != ZIO_COMPRESS_INHERIT);
 
-	if (child == ZIO_COMPRESS_INHERIT)
-		return (parent);
+	result = child;
+	if (result == ZIO_COMPRESS_INHERIT)
+		result = parent;
 
-	if (child == ZIO_COMPRESS_ON)
-		return (ZIO_COMPRESS_ON_VALUE);
+	if (result == ZIO_COMPRESS_ON) {
+		if (spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS))
+			result = ZIO_COMPRESS_LZ4_ON_VALUE;
+		else
+			result = ZIO_COMPRESS_LEGACY_ON_VALUE;
+	}
 
-	return (child);
+	return (result);
 }
 
 size_t
@@ -81,12 +110,14 @@
 zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
 {
 	uint64_t *word, *word_end;
-	size_t c_len, d_len, r_len;
+	size_t c_len, d_len;
 	zio_compress_info_t *ci = &zio_compress_table[c];
 
 	ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
 	ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
 
+	ZCOMPSTAT_BUMP(zcompstat_attempts);
+
 	/*
 	 * If the data is all zeroes, we don't even need to allocate
 	 * a block for it.  We indicate this by returning zero size.
@@ -96,35 +127,24 @@
 		if (*word != 0)
 			break;
 
-	if (word == word_end)
-		return (0);
+	if (word == word_end) {
+		ZCOMPSTAT_BUMP(zcompstat_empty);
+		return (0);
+	}
 
 	if (c == ZIO_COMPRESS_EMPTY)
 		return (s_len);
 
 	/* Compress at least 12.5% */
-	d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE);
-	if (d_len == 0)
-		return (s_len);
-
+	d_len = s_len - (s_len >> 3);
 	c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
 
-	if (c_len > d_len)
+	if (c_len > d_len) {
+		ZCOMPSTAT_BUMP(zcompstat_skipped_insufficient_gain);
 		return (s_len);
-
-	/*
-	 * Cool.  We compressed at least as much as we were hoping to.
-	 * For both security and repeatability, pad out the last sector.
-	 */
-	r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE);
-	if (r_len > c_len) {
-		bzero((char *)dst + c_len, r_len - c_len);
-		c_len = r_len;
 	}
 
 	ASSERT3U(c_len, <=, d_len);
-	ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0);
-
 	return (c_len);
 }
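Note the simplification in this hunk: the old code aligned d_len down to SPA_MINBLOCKSIZE and padded the compressed result back up, while the new code keeps the raw 12.5% rule. Concretely, for a 131072-byte block the compressed output must fit in 131072 - (131072 >> 3) = 114688 bytes or the block is written uncompressed. A standalone restatement of just the gate (hypothetical helper, not from the tree):

#include <stddef.h>

/*
 * Return nonzero when a c_len-byte compressed result saves at least
 * 12.5% over the s_len-byte source, mirroring the check above.
 */
static int
compression_worthwhile(size_t s_len, size_t c_len)
{
	size_t d_len = s_len - (s_len >> 3);	/* 87.5% of the source */

	return (c_len <= d_len);
}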
 
@@ -139,3 +159,26 @@
 
 	return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
 }
+
+void
+zio_compress_init(void)
+{
+
+	zcomp_ksp = kstat_create("zfs", 0, "zcompstats", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (zcomp_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+
+	if (zcomp_ksp != NULL) {
+		zcomp_ksp->ks_data = &zcomp_stats;
+		kstat_install(zcomp_ksp);
+	}
+}
+
+void
+zio_compress_fini(void)
+{
+	if (zcomp_ksp != NULL) {
+		kstat_delete(zcomp_ksp);
+		zcomp_ksp = NULL;
+	}
+}
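As a usage note: on FreeBSD, virtual kstats registered this way are normally surfaced through the kstat sysctl bridge, so the counters above should be readable with something like `sysctl kstat.zfs.misc.zcompstats` once the module is loaded. The exact node name follows the module/name arguments passed to kstat_create() above; treat the path as an assumption, not a guarantee.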

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 /*
@@ -50,15 +50,53 @@
 
 uint32_t zio_injection_enabled;
 
+/*
+ * Describes each zinject handler registered on the system, and
+ * contains the list node linking the handler into the global zinject
+ * handler list.
+ */
 typedef struct inject_handler {
 	int			zi_id;
 	spa_t			*zi_spa;
 	zinject_record_t	zi_record;
+	uint64_t		*zi_lanes;
+	int			zi_next_lane;
 	list_node_t		zi_link;
 } inject_handler_t;
 
+/*
+ * List of all zinject handlers registered on the system, protected by
+ * the inject_lock defined below.
+ */
 static list_t inject_handlers;
+
+/*
+ * This protects insertion into, and traversal of, the inject handler
+ * list defined above; as well as the inject_delay_count. Any time a
+ * handler is inserted or removed from the list, this lock should be
+ * taken as a RW_WRITER; and any time traversal is done over the list
+ * (without modification to it) this lock should be taken as a RW_READER.
+ */
 static krwlock_t inject_lock;
+
+/*
+ * This holds the number of zinject delay handlers that have been
+ * registered on the system. It is protected by the inject_lock defined
+ * above. Thus modifications to this count must be a RW_WRITER of the
+ * inject_lock, and reads of this count must be (at least) a RW_READER
+ * of the lock.
+ */
+static int inject_delay_count = 0;
+
+/*
+ * This lock is used only in zio_handle_io_delay(); refer to the comment
+ * in that function for more details.
+ */
+static kmutex_t inject_delay_mtx;
+
+/*
+ * Used to assign unique identifying numbers to each new zinject handler.
+ */
 static int inject_next_id = 1;
 
 /*
@@ -65,7 +103,7 @@
  * Returns true if the given record matches the I/O in progress.
  */
 static boolean_t
-zio_match_handler(zbookmark_t *zb, uint64_t type,
+zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
     zinject_record_t *record, int error)
 {
 	/*
@@ -361,32 +399,164 @@
 	rw_exit(&inject_lock);
 }
 
-uint64_t
+hrtime_t
 zio_handle_io_delay(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
-	inject_handler_t *handler;
-	uint64_t seconds = 0;
+	inject_handler_t *min_handler = NULL;
+	hrtime_t min_target = 0;
 
-	if (zio_injection_enabled == 0)
+	rw_enter(&inject_lock, RW_READER);
+
+	/*
+	 * inject_delay_count is a subset of zio_injection_enabled that
+	 * is only incremented for delay handlers. These checks are
+	 * mainly added to remind the reader why we're not explicitly
+	 * checking zio_injection_enabled like the other functions.
+	 */
+	IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
+	IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);
+
+	/*
+	 * If there aren't any inject delay handlers registered, then we
+	 * can short circuit and simply return 0 here. A value of zero
+	 * informs zio_delay_interrupt() that this request should not be
+	 * delayed. This short circuit keeps us from acquiring the
+	 * inject_delay_mtx unnecessarily.
+	 */
+	if (inject_delay_count == 0) {
+		rw_exit(&inject_lock);
 		return (0);
+	}
 
-	rw_enter(&inject_lock, RW_READER);
+	/*
+	 * Each inject handler has a number of "lanes" associated with
+	 * it. Each lane is able to handle requests independently of one
+	 * another, and at a latency defined by the inject handler
+	 * record's zi_timer field. Thus if a handler is configured with
+	 * a single lane with a 10ms latency, it will delay requests
+	 * such that only a single request is completed every 10ms. So,
+	 * if more than one request is attempted in each 10ms interval,
+	 * the average latency of the requests will be greater than
+	 * 10ms; but if only a single request is submitted each 10ms
+	 * interval the average latency will be 10ms.
+	 *
+	 * We need to acquire this mutex to prevent multiple concurrent
+	 * threads being assigned to the same lane of a given inject
+	 * handler. The mutex allows us to perform the following two
+	 * operations atomically:
+	 *
+	 *	1. determine the minimum handler and minimum target
+	 *	   value of all the possible handlers
+	 *	2. update that minimum handler's lane array
+	 *
+	 * Without atomicity, two (or more) threads could pick the same
+	 * lane in step (1), and then conflict with each other in step
+	 * (2). This could allow a single lane handler to process
+	 * multiple requests simultaneously, which shouldn't be possible.
+	 */
+	mutex_enter(&inject_delay_mtx);
 
-	for (handler = list_head(&inject_handlers); handler != NULL;
-	    handler = list_next(&inject_handlers, handler)) {
-
+	for (inject_handler_t *handler = list_head(&inject_handlers);
+	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
 		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
 			continue;
 
-		if (vd->vdev_guid == handler->zi_record.zi_guid) {
-			seconds = handler->zi_record.zi_timer;
-			break;
+		if (vd->vdev_guid != handler->zi_record.zi_guid)
+			continue;
+
+		/*
+		 * Defensive; should never happen as the array allocation
+		 * occurs prior to inserting this handler on the list.
+		 */
+		ASSERT3P(handler->zi_lanes, !=, NULL);
+
+		/*
+		 * This should never happen, the zinject command should
+		 * prevent a user from setting an IO delay with zero lanes.
+		 */
+		ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
+
+		ASSERT3U(handler->zi_record.zi_nlanes, >,
+		    handler->zi_next_lane);
+
+		/*
+		 * We want to issue this IO to the lane that will become
+		 * idle the soonest, so we compare the soonest this
+		 * specific handler can complete the IO with all other
+		 * handlers, to find the lowest value of all possible
+		 * lanes. We then use this lane to submit the request.
+		 *
+		 * Since each handler has a constant value for its
+		 * delay, we can just use the "next" lane for that
+		 * handler, as it will always be the lane with the
+		 * lowest value for that particular handler (i.e. the
+		 * lane that will become idle the soonest). This saves a
+		 * scan of each handler's lanes array.
+		 *
+		 * There are two cases to consider when determining when
+		 * this specific IO request should complete. If this
+		 * lane is idle, we want to "submit" the request now so
+		 * it will complete after zi_timer milliseconds. Thus,
+		 * we set the target to now + zi_timer.
+		 *
+		 * If the lane is busy, we want this request to complete
+		 * zi_timer milliseconds after the lane becomes idle.
+		 * Since the 'zi_lanes' array holds the time at which
+		 * each lane will become idle, we use that value to
+		 * determine when this request should complete.
+		 */
+		hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
+		hrtime_t busy = handler->zi_record.zi_timer +
+		    handler->zi_lanes[handler->zi_next_lane];
+		hrtime_t target = MAX(idle, busy);
+
+		if (min_handler == NULL) {
+			min_handler = handler;
+			min_target = target;
+			continue;
 		}
 
+		ASSERT3P(min_handler, !=, NULL);
+		ASSERT3U(min_target, !=, 0);
+
+		/*
+		 * We don't yet increment the "next lane" variable since
+		 * we still might find a lower value lane in another
+		 * handler during any remaining iterations. Once we're
+		 * sure we've selected the absolute minimum, we'll claim
+		 * the lane and increment the handler's "next lane"
+		 * field below.
+		 */
+
+		if (target < min_target) {
+			min_handler = handler;
+			min_target = target;
+		}
 	}
+
+	/*
+	 * 'min_handler' will be NULL if no IO delays are registered for
+	 * this vdev, otherwise it will point to the handler containing
+	 * the lane that will become idle the soonest.
+	 */
+	if (min_handler != NULL) {
+		ASSERT3U(min_target, !=, 0);
+		min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;
+
+		/*
+		 * If we've used all possible lanes for this handler,
+		 * loop back and start using the first lane again;
+		 * otherwise, just increment the lane index.
+		 */
+		min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
+		    min_handler->zi_record.zi_nlanes;
+	}
+
+	mutex_exit(&inject_delay_mtx);
 	rw_exit(&inject_lock);
-	return (seconds);
+
+	return (min_target);
 }
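The loop above boils down to a small per-handler scheduler: each lane records the hrtime at which it next goes idle, and a request's completion target is MAX(now + delay, lane_idle + delay). A self-contained model of one handler's lane bookkeeping (fixed lane count; all names hypothetical):

#include <stdint.h>

#define	NLANES	4

typedef int64_t hrtime_t;

/*
 * Return the completion target for the next request on this handler
 * and mark the chosen lane busy until that time.  Lanes are claimed
 * round-robin, so lanes[*next] is always the lane that goes idle
 * soonest for this handler.
 */
static hrtime_t
lane_schedule(hrtime_t lanes[NLANES], int *next, hrtime_t now,
    hrtime_t delay)
{
	hrtime_t idle = now + delay;		/* lane is free right now */
	hrtime_t busy = lanes[*next] + delay;	/* queue behind the lane */
	hrtime_t target = (idle > busy) ? idle : busy;

	lanes[*next] = target;
	*next = (*next + 1) % NLANES;
	return (target);
}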
 
 /*
@@ -410,6 +580,24 @@
 		if ((error = spa_reset(name)) != 0)
 			return (error);
 
+	if (record->zi_cmd == ZINJECT_DELAY_IO) {
+		/*
+		 * A value of zero for the number of lanes or for the
+		 * delay time doesn't make sense.
+		 */
+		if (record->zi_timer == 0 || record->zi_nlanes == 0)
+			return (SET_ERROR(EINVAL));
+
+		/*
+		 * The number of lanes is directly mapped to the size of
+		 * an array used by the handler. Thus, to ensure the
+		 * user doesn't trigger an allocation that's "too large"
+		 * we cap the number of lanes here.
+		 */
+		if (record->zi_nlanes >= UINT16_MAX)
+			return (SET_ERROR(EINVAL));
+	}
+
 	if (!(flags & ZINJECT_NULL)) {
 		/*
 		 * spa_inject_ref() will add an injection reference, which will
@@ -421,13 +609,36 @@
 
 		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
 
+		handler->zi_spa = spa;
+		handler->zi_record = *record;
+
+		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+			handler->zi_lanes = kmem_zalloc(
+			    sizeof (*handler->zi_lanes) *
+			    handler->zi_record.zi_nlanes, KM_SLEEP);
+			handler->zi_next_lane = 0;
+		} else {
+			handler->zi_lanes = NULL;
+			handler->zi_next_lane = 0;
+		}
+
 		rw_enter(&inject_lock, RW_WRITER);
 
+		/*
+		 * We can't move this increment into the conditional
+		 * above because we need to hold the RW_WRITER lock of
+		 * inject_lock, and we don't want to hold that while
+		 * allocating the handler's zi_lanes array.
+		 */
+		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+			ASSERT3S(inject_delay_count, >=, 0);
+			inject_delay_count++;
+			ASSERT3S(inject_delay_count, >, 0);
+		}
+
 		*id = handler->zi_id = inject_next_id++;
-		handler->zi_spa = spa;
-		handler->zi_record = *record;
 		list_insert_tail(&inject_handlers, handler);
-		atomic_add_32(&zio_injection_enabled, 1);
+		atomic_inc_32(&zio_injection_enabled);
 
 		rw_exit(&inject_lock);
 	}
@@ -439,7 +650,11 @@
 	 * fault injection isn't a performance critical path.
 	 */
 	if (flags & ZINJECT_FLUSH_ARC)
-		arc_flush(NULL);
+		/*
+		 * We must use FALSE to ensure arc_flush returns, since
+		 * we're not preventing concurrent ARC insertions.
+		 */
+		arc_flush(NULL, FALSE);
 
 	return (0);
 }
@@ -499,12 +714,26 @@
 		return (SET_ERROR(ENOENT));
 	}
 
+	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+		ASSERT3S(inject_delay_count, >, 0);
+		inject_delay_count--;
+		ASSERT3S(inject_delay_count, >=, 0);
+	}
+
 	list_remove(&inject_handlers, handler);
 	rw_exit(&inject_lock);
 
+	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+		ASSERT3P(handler->zi_lanes, !=, NULL);
+		kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
+		    handler->zi_record.zi_nlanes);
+	} else {
+		ASSERT3P(handler->zi_lanes, ==, NULL);
+	}
+
 	spa_inject_delref(handler->zi_spa);
 	kmem_free(handler, sizeof (inject_handler_t));
-	atomic_add_32(&zio_injection_enabled, -1);
+	atomic_dec_32(&zio_injection_enabled);
 
 	return (0);
 }
@@ -513,6 +742,7 @@
 zio_inject_init(void)
 {
 	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
+	mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&inject_handlers, sizeof (inject_handler_t),
 	    offsetof(inject_handler_t, zi_link));
 }
@@ -521,5 +751,6 @@
 zio_inject_fini(void)
 {
 	list_destroy(&inject_handlers);
+	mutex_destroy(&inject_delay_mtx);
 	rw_destroy(&inject_lock);
 }
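For scale, the zi_nlanes cap added earlier in this file bounds the per-handler zi_lanes allocation below UINT16_MAX * sizeof (uint64_t), i.e. roughly 512 KB, which is the "too large" allocation the registration check is guarding against.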

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,6 +21,8 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 The MathWorks, Inc. All rights reserved.
  */
 
 /*
@@ -43,7 +45,7 @@
  * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is
  * treated as zero references.
  */
-#define	ZRL_LOCKED	((uint32_t)-1)
+#define	ZRL_LOCKED	-1
 #define	ZRL_DESTROYED	-2
 
 void
@@ -61,7 +63,7 @@
 void
 zrl_destroy(zrlock_t *zrl)
 {
-	ASSERT(zrl->zr_refcount == 0);
+	ASSERT0(zrl->zr_refcount);
 
 	mutex_destroy(&zrl->zr_mtx);
 	zrl->zr_refcount = ZRL_DESTROYED;
@@ -69,43 +71,34 @@
 }
 
 void
+zrl_add_impl(zrlock_t *zrl, const char *zc)
+{
+	for (;;) {
+		uint32_t n = (uint32_t)zrl->zr_refcount;
+		while (n != ZRL_LOCKED) {
+			uint32_t cas = atomic_cas_32(
+			    (uint32_t *)&zrl->zr_refcount, n, n + 1);
+			if (cas == n) {
+				ASSERT3S((int32_t)n, >=, 0);
 #ifdef	ZFS_DEBUG
-zrl_add_debug(zrlock_t *zrl, const char *zc)
-#else
-zrl_add(zrlock_t *zrl)
+				if (zrl->zr_owner == curthread) {
+					DTRACE_PROBE2(zrlock__reentry,
+					    zrlock_t *, zrl, uint32_t, n);
+				}
+				zrl->zr_owner = curthread;
+				zrl->zr_caller = zc;
 #endif
-{
-	uint32_t n = (uint32_t)zrl->zr_refcount;
-
-	while (n != ZRL_LOCKED) {
-		uint32_t cas = atomic_cas_32(
-		    (uint32_t *)&zrl->zr_refcount, n, n + 1);
-		if (cas == n) {
-			ASSERT((int32_t)n >= 0);
-#ifdef	ZFS_DEBUG
-			if (zrl->zr_owner == curthread) {
-				DTRACE_PROBE2(zrlock__reentry,
-				    zrlock_t *, zrl, uint32_t, n);
+				return;
 			}
-			zrl->zr_owner = curthread;
-			zrl->zr_caller = zc;
-#endif
-			return;
+			n = cas;
 		}
-		n = cas;
-	}
 
-	mutex_enter(&zrl->zr_mtx);
-	while (zrl->zr_refcount == ZRL_LOCKED) {
-		cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
+		mutex_enter(&zrl->zr_mtx);
+		while (zrl->zr_refcount == ZRL_LOCKED) {
+			cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
+		}
+		mutex_exit(&zrl->zr_mtx);
 	}
-	ASSERT(zrl->zr_refcount >= 0);
-	zrl->zr_refcount++;
-#ifdef	ZFS_DEBUG
-	zrl->zr_owner = curthread;
-	zrl->zr_caller = zc;
-#endif
-	mutex_exit(&zrl->zr_mtx);
 }
 
 void
@@ -113,8 +106,6 @@
 {
 	uint32_t n;
 
-	n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
-	ASSERT((int32_t)n >= 0);
 #ifdef	ZFS_DEBUG
 	if (zrl->zr_owner == curthread) {
 		zrl->zr_owner = NULL;
@@ -121,6 +112,8 @@
 		zrl->zr_caller = NULL;
 	}
 #endif
+	n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
+	ASSERT3S((int32_t)n, >=, 0);
 }
 
 int
@@ -133,7 +126,7 @@
 		    (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED);
 		if (cas == 0) {
 #ifdef	ZFS_DEBUG
-			ASSERT(zrl->zr_owner == NULL);
+			ASSERT3P(zrl->zr_owner, ==, NULL);
 			zrl->zr_owner = curthread;
 #endif
 			return (1);
@@ -140,7 +133,7 @@
 		}
 	}
 
-	ASSERT((int32_t)n > ZRL_DESTROYED);
+	ASSERT3S((int32_t)n, >, ZRL_DESTROYED);
 
 	return (0);
 }
@@ -148,11 +141,11 @@
 void
 zrl_exit(zrlock_t *zrl)
 {
-	ASSERT(zrl->zr_refcount == ZRL_LOCKED);
+	ASSERT3S(zrl->zr_refcount, ==, ZRL_LOCKED);
 
 	mutex_enter(&zrl->zr_mtx);
 #ifdef	ZFS_DEBUG
-	ASSERT(zrl->zr_owner == curthread);
+	ASSERT3P(zrl->zr_owner, ==, curthread);
 	zrl->zr_owner = NULL;
 	membar_producer();	/* make sure the owner store happens first */
 #endif
@@ -164,7 +157,7 @@
 int
 zrl_refcount(zrlock_t *zrl)
 {
-	ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+	ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
 
 	int n = (int)zrl->zr_refcount;
 	return (n <= 0 ? 0 : n);
@@ -173,7 +166,7 @@
 int
 zrl_is_zero(zrlock_t *zrl)
 {
-	ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+	ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
 
 	return (zrl->zr_refcount <= 0);
 }
@@ -181,7 +174,7 @@
 int
 zrl_is_locked(zrlock_t *zrl)
 {
-	ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+	ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
 
 	return (zrl->zr_refcount == ZRL_LOCKED);
 }
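The restructured zrl_add_impl() above is the classic compare-and-swap reference loop: attempt n -> n + 1 while the count is not ZRL_LOCKED, otherwise sleep on the condition variable and retry from the top. A reduced C11 model of just the lock-free part (the kernel uses atomic_cas_32; names here are illustrative):

#include <stdatomic.h>
#include <stdint.h>

#define	LOCKED	((uint32_t)-1)

/*
 * Try to take a reference unless the counter is in the LOCKED state.
 * Returns 1 on success, 0 when the caller should block and retry.
 */
static int
ref_try_add(_Atomic uint32_t *refcount)
{
	uint32_t n = atomic_load(refcount);

	while (n != LOCKED) {
		/* On failure, n is reloaded with the current value. */
		if (atomic_compare_exchange_weak(refcount, &n, n + 1))
			return (1);
	}
	return (0);
}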

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -24,10 +24,15 @@
  *
  * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd at FreeBSD.org>
  * All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ *
+ * Portions Copyright 2010 Robert Milkowski
+ *
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
-/* Portions Copyright 2010 Robert Milkowski */
 /* Portions Copyright 2011 Martin Matuska <mm at FreeBSD.org> */
 
 /*
@@ -61,7 +66,9 @@
 #include <sys/stat.h>
 #include <sys/zap.h>
 #include <sys/spa.h>
+#include <sys/spa_impl.h>
 #include <sys/zio.h>
+#include <sys/disk.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dnode.h>
 #include <sys/dsl_dataset.h>
@@ -71,6 +78,7 @@
 #include <sys/sunddi.h>
 #include <sys/dirent.h>
 #include <sys/policy.h>
+#include <sys/queue.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zil.h>
@@ -78,13 +86,20 @@
 #include <sys/zfs_znode.h>
 #include <sys/zfs_rlock.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
 #include <sys/zvol.h>
 #include <sys/zil_impl.h>
 #include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
+#include <sys/filio.h>
+
 #include <geom/geom.h>
 
 #include "zfs_namecheck.h"
 
+#ifndef illumos
 struct g_class zfs_zvol_class = {
 	.name = "ZFS::ZVOL",
 	.version = G_VERSION,
@@ -92,6 +107,7 @@
 
 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
 
+#endif
 void *zfsdev_state;
 static char *zvol_tag = "zvol_tag";
 
@@ -98,13 +114,31 @@
 #define	ZVOL_DUMPSIZE		"dumpsize"
 
 /*
- * The spa_namespace_lock protects the zfsdev_state structure from being
- * modified while it's being used, e.g. an open that comes in before a
- * create finishes.  It also protects temporary opens of the dataset so that,
+ * This lock protects the zfsdev_state structure from being modified
+ * while it's being used, e.g. an open that comes in before a create
+ * finishes.  It also protects temporary opens of the dataset so that,
  * e.g., an open doesn't get a spurious EBUSY.
  */
+#ifdef illumos
+kmutex_t zfsdev_state_lock;
+#else
+/*
+ * In FreeBSD we've replaced the upstream zfsdev_state_lock with the
+ * spa_namespace_lock in the ZVOL code.
+ */
+#define zfsdev_state_lock spa_namespace_lock
+#endif
 static uint32_t zvol_minors;
 
+#ifndef illumos
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
+static int	volmode = ZFS_VOLMODE_GEOM;
+TUNABLE_INT("vfs.zfs.vol.mode", &volmode);
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &volmode, 0,
+    "Expose as GEOM providers (1), device files (2) or neither");
+
+#endif
 typedef struct zvol_extent {
 	list_node_t	ze_node;
 	dva_t		ze_dva;		/* dva associated with this extent */
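A usage note on the tunable added above: vfs.zfs.vol.mode is both a loader tunable and a runtime sysctl (CTLFLAG_RWTUN), so for example `sysctl vfs.zfs.vol.mode=2` selects cdev mode for zvol minors created afterwards. As the code later in this diff shows, a per-dataset volmode property other than "default" overrides the global setting.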
@@ -115,23 +149,41 @@
  * The in-core state of each volume.
  */
 typedef struct zvol_state {
+#ifndef illumos
+	LIST_ENTRY(zvol_state)	zv_links;
+#endif
 	char		zv_name[MAXPATHLEN]; /* pool/dd name */
 	uint64_t	zv_volsize;	/* amount of space we advertise */
 	uint64_t	zv_volblocksize; /* volume block size */
+#ifdef illumos
+	minor_t		zv_minor;	/* minor number */
+#else
+	struct cdev	*zv_dev;	/* non-GEOM device */
 	struct g_provider *zv_provider;	/* GEOM provider */
+#endif
 	uint8_t		zv_min_bs;	/* minimum addressable block shift */
 	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
 	objset_t	*zv_objset;	/* objset handle */
+#ifdef illumos
+	uint32_t	zv_open_count[OTYPCNT];	/* open counts */
+#endif
 	uint32_t	zv_total_opens;	/* total open count */
+	uint32_t	zv_sync_cnt;	/* synchronous open count */
 	zilog_t		*zv_zilog;	/* ZIL handle */
 	list_t		zv_extents;	/* List of extents for dump */
 	znode_t		zv_znode;	/* for range locking */
 	dmu_buf_t	*zv_dbuf;	/* bonus handle */
+#ifndef illumos
 	int		zv_state;
+	int		zv_volmode;	/* Provide GEOM or cdev */
 	struct bio_queue_head zv_queue;
 	struct mtx	zv_queue_mtx;	/* zv_queue mutex */
+#endif
 } zvol_state_t;
 
+#ifndef illumos
+static LIST_HEAD(, zvol_state) all_zvols;
+#endif
 /*
  * zvol specific flags
  */
@@ -145,6 +197,43 @@
  */
 int zvol_maxphys = DMU_MAX_ACCESS/2;
 
+/*
+ * Toggle unmap functionality.
+ */
+boolean_t zvol_unmap_enabled = B_TRUE;
+#ifndef illumos
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
+    &zvol_unmap_enabled, 0,
+    "Enable UNMAP functionality");
+
+static d_open_t		zvol_d_open;
+static d_close_t	zvol_d_close;
+static d_read_t		zvol_read;
+static d_write_t	zvol_write;
+static d_ioctl_t	zvol_d_ioctl;
+static d_strategy_t	zvol_strategy;
+
+static struct cdevsw zvol_cdevsw = {
+	.d_version =	D_VERSION,
+	.d_open =	zvol_d_open,
+	.d_close =	zvol_d_close,
+	.d_read =	zvol_read,
+	.d_write =	zvol_write,
+	.d_ioctl =	zvol_d_ioctl,
+	.d_strategy =	zvol_strategy,
+	.d_name =	"zvol",
+	.d_flags =	D_DISK | D_TRACKCLOSE,
+};
+
+static void zvol_geom_run(zvol_state_t *zv);
+static void zvol_geom_destroy(zvol_state_t *zv);
+static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
+static void zvol_geom_start(struct bio *bp);
+static void zvol_geom_worker(void *arg);
+static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off,
+    uint64_t len, boolean_t sync);
+#endif	/* !illumos */
+
 extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
     nvlist_t *, nvlist_t *);
 static int zvol_remove_zv(zvol_state_t *);
@@ -153,19 +242,13 @@
 static int zvol_dump_fini(zvol_state_t *zv);
 static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
 
-static zvol_state_t *zvol_geom_create(const char *name);
-static void zvol_geom_run(zvol_state_t *zv);
-static void zvol_geom_destroy(zvol_state_t *zv);
-static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
-static void zvol_geom_start(struct bio *bp);
-static void zvol_geom_worker(void *arg);
-
 static void
-zvol_size_changed(zvol_state_t *zv)
+zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
 {
-#ifdef sun
-	dev_t dev = makedevice(maj, min);
+#ifdef illumos
+	dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor);
 
+	zv->zv_volsize = volsize;
 	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
 	    "Size", volsize) == DDI_SUCCESS);
 	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
@@ -174,22 +257,19 @@
 	/* Notify specfs to invalidate the cached size */
 	spec_size_invalidate(dev, VBLK);
 	spec_size_invalidate(dev, VCHR);
-#else	/* !sun */
-	struct g_provider *pp;
+#else	/* !illumos */
+	zv->zv_volsize = volsize;
+	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+		struct g_provider *pp;
 
-	pp = zv->zv_provider;
-	if (pp == NULL)
-		return;
-	if (zv->zv_volsize == pp->mediasize)
-		return;
-	/*
-	 * Changing provider size is not really supported by GEOM, but it
-	 * should be safe when provider is closed.
-	 */
-	if (zv->zv_total_opens > 0)
-		return;
-	pp->mediasize = zv->zv_volsize;
-#endif	/* !sun */
+		pp = zv->zv_provider;
+		if (pp == NULL)
+			return;
+		g_topology_lock();
+		g_resize_provider(pp, zv->zv_volsize);
+		g_topology_unlock();
+	}
+#endif	/* illumos */
 }
 
 int
@@ -212,7 +292,7 @@
 zvol_check_volblocksize(uint64_t volblocksize)
 {
 	if (volblocksize < SPA_MINBLOCKSIZE ||
-	    volblocksize > SPA_MAXBLOCKSIZE ||
+	    volblocksize > SPA_OLD_MAXBLOCKSIZE ||
 	    !ISP2(volblocksize))
 		return (SET_ERROR(EDOM));
 
@@ -245,26 +325,26 @@
 static zvol_state_t *
 zvol_minor_lookup(const char *name)
 {
-	struct g_provider *pp;
-	struct g_geom *gp;
-	zvol_state_t *zv = NULL;
+#ifdef illumos
+	minor_t minor;
+#endif
+	zvol_state_t *zv;
 
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 
-	g_topology_lock();
-	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
-		pp = LIST_FIRST(&gp->provider);
-		if (pp == NULL)
-			continue;
-		zv = pp->private;
+#ifdef illumos
+	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
+		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 		if (zv == NULL)
 			continue;
+#else
+	LIST_FOREACH(zv, &all_zvols, zv_links) {
+#endif
 		if (strcmp(zv->zv_name, name) == 0)
-			break;
+			return (zv);
 	}
-	g_topology_unlock();
 
-	return (gp != NULL ? zv : NULL);
+	return (NULL);
 }
 
 /* extent mapping arg */
@@ -276,15 +356,18 @@
 /*ARGSUSED*/
 static int
 zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct maparg *ma = arg;
 	zvol_extent_t *ze;
 	int bs = ma->ma_zv->zv_volblocksize;
 
-	if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
+	if (bp == NULL || BP_IS_HOLE(bp) ||
+	    zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
 		return (0);
 
+	VERIFY(!BP_IS_EMBEDDED(bp));
+
 	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
 	ma->ma_blks++;
 
@@ -385,6 +468,24 @@
 }
 
 /*
+ * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
+ * implement DKIOCFREE/free-long-range.
+ */
+static int
+zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
+{
+	uint64_t offset, length;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	offset = lr->lr_offset;
+	length = lr->lr_length;
+
+	return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
+}
+
+/*
  * Replay a TX_WRITE ZIL transaction that didn't get committed
  * after a system failure
  */
@@ -434,7 +535,7 @@
 
 /*
  * Callback vectors for replaying records.
- * Only TX_WRITE is needed for zvol.
+ * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
  */
 zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
 	zvol_replay_err,	/* 0 no such transaction type */
@@ -447,7 +548,7 @@
 	zvol_replay_err,	/* TX_LINK */
 	zvol_replay_err,	/* TX_RENAME */
 	zvol_replay_write,	/* TX_WRITE */
-	zvol_replay_err,	/* TX_TRUNCATE */
+	zvol_replay_truncate,	/* TX_TRUNCATE */
 	zvol_replay_err,	/* TX_SETATTR */
 	zvol_replay_err,	/* TX_ACL */
 	zvol_replay_err,	/* TX_CREATE_ACL */
@@ -459,20 +560,20 @@
 	zvol_replay_err,	/* TX_WRITE2 */
 };
 
-#ifdef sun
+#ifdef illumos
 int
 zvol_name2minor(const char *name, minor_t *minor)
 {
 	zvol_state_t *zv;
 
-	mutex_enter(&spa_namespace_lock);
+	mutex_enter(&zfsdev_state_lock);
 	zv = zvol_minor_lookup(name);
 	if (minor && zv)
 		*minor = zv->zv_minor;
-	mutex_exit(&spa_namespace_lock);
+	mutex_exit(&zfsdev_state_lock);
 	return (zv ? 0 : -1);
 }
-#endif	/* sun */
+#endif	/* illumos */
 
 /*
  * Create a minor node (plus a whole lot more) for the specified volume.
@@ -484,15 +585,24 @@
 	zvol_state_t *zv;
 	objset_t *os;
 	dmu_object_info_t doi;
-	uint64_t volsize;
+#ifdef illumos
+	minor_t minor = 0;
+	char chrbuf[30], blkbuf[30];
+#else
+	struct g_provider *pp;
+	struct g_geom *gp;
+	uint64_t volsize, mode;
+#endif
 	int error;
 
+#ifndef illumos
 	ZFS_LOG(1, "Creating ZVOL %s...", name);
+#endif
 
-	mutex_enter(&spa_namespace_lock);
+	mutex_enter(&zfsdev_state_lock);
 
 	if (zvol_minor_lookup(name) != NULL) {
-		mutex_exit(&spa_namespace_lock);
+		mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(EEXIST));
 	}
 
@@ -500,20 +610,20 @@
 	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
 
 	if (error) {
-		mutex_exit(&spa_namespace_lock);
+		mutex_exit(&zfsdev_state_lock);
 		return (error);
 	}
 
-#ifdef sun
+#ifdef illumos
 	if ((minor = zfsdev_minor_alloc()) == 0) {
 		dmu_objset_disown(os, FTAG);
-		mutex_exit(&spa_namespace_lock);
+		mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(ENXIO));
 	}
 
 	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
 		dmu_objset_disown(os, FTAG);
-		mutex_exit(&spa_namespace_lock);
+		mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(EAGAIN));
 	}
 	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
@@ -525,7 +635,7 @@
 	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
 		ddi_soft_state_free(zfsdev_state, minor);
 		dmu_objset_disown(os, FTAG);
-		mutex_exit(&spa_namespace_lock);
+		mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(EAGAIN));
 	}
 
@@ -536,7 +646,7 @@
 		ddi_remove_minor_node(zfs_dip, chrbuf);
 		ddi_soft_state_free(zfsdev_state, minor);
 		dmu_objset_disown(os, FTAG);
-		mutex_exit(&spa_namespace_lock);
+		mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(EAGAIN));
 	}
 
@@ -543,26 +653,68 @@
 	zs = ddi_get_soft_state(zfsdev_state, minor);
 	zs->zss_type = ZSST_ZVOL;
 	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
-#else	/* !sun */
+#else	/* !illumos */
 
+	zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
+	zv->zv_state = 0;
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 	if (error) {
-		ASSERT(error == 0);
+		kmem_free(zv, sizeof(*zv));
 		dmu_objset_disown(os, zvol_tag);
-		mutex_exit(&spa_namespace_lock);
+		mutex_exit(&zfsdev_state_lock);
 		return (error);
 	}
+	error = dsl_prop_get_integer(name,
+	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &mode, NULL);
+	if (error != 0 || mode == ZFS_VOLMODE_DEFAULT)
+		mode = volmode;
 
 	DROP_GIANT();
-	g_topology_lock();
-	zv = zvol_geom_create(name);
 	zv->zv_volsize = volsize;
-	zv->zv_provider->mediasize = zv->zv_volsize;
+	zv->zv_volmode = mode;
+	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+		g_topology_lock();
+		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+		gp->start = zvol_geom_start;
+		gp->access = zvol_geom_access;
+		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+		pp->sectorsize = DEV_BSIZE;
+		pp->mediasize = zv->zv_volsize;
+		pp->private = zv;
 
-#endif	/* !sun */
+		zv->zv_provider = pp;
+		bioq_init(&zv->zv_queue);
+		mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
+	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+		struct make_dev_args args;
 
+		make_dev_args_init(&args);
+		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+		args.mda_devsw = &zvol_cdevsw;
+		args.mda_cr = NULL;
+		args.mda_uid = UID_ROOT;
+		args.mda_gid = GID_OPERATOR;
+		args.mda_mode = 0640;
+		args.mda_si_drv2 = zv;
+		error = make_dev_s(&args, &zv->zv_dev,
+		    "%s/%s", ZVOL_DRIVER, name);
+		if (error != 0) {
+			kmem_free(zv, sizeof(*zv));
+			dmu_objset_disown(os, FTAG);
+			mutex_exit(&zfsdev_state_lock);
+			return (error);
+		}
+		zv->zv_dev->si_iosize_max = MAXPHYS;
+	}
+	LIST_INSERT_HEAD(&all_zvols, zv, zv_links);
+#endif	/* illumos */
+
 	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
 	zv->zv_min_bs = DEV_BSHIFT;
+#ifdef illumos
+	zv->zv_minor = minor;
+#endif
 	zv->zv_objset = os;
 	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
 		zv->zv_flags |= ZVOL_RDONLY;
@@ -587,14 +739,16 @@
 
 	zvol_minors++;
 
-	mutex_exit(&spa_namespace_lock);
-
-	zvol_geom_run(zv);
-
-	g_topology_unlock();
+	mutex_exit(&zfsdev_state_lock);
+#ifndef illumos
+	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+		zvol_geom_run(zv);
+		g_topology_unlock();
+	}
 	PICKUP_GIANT();
 
 	ZFS_LOG(1, "ZVOL %s created.", name);
+#endif
 
 	return (0);
 }
@@ -605,26 +759,42 @@
 static int
 zvol_remove_zv(zvol_state_t *zv)
 {
-#ifdef sun
+#ifdef illumos
+	char nmbuf[20];
 	minor_t minor = zv->zv_minor;
 #endif
 
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 	if (zv->zv_total_opens != 0)
 		return (SET_ERROR(EBUSY));
 
-	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
-
-#ifdef sun
+#ifdef illumos
 	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
 	ddi_remove_minor_node(zfs_dip, nmbuf);
-#endif	/* sun */
 
+	(void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
+	ddi_remove_minor_node(zfs_dip, nmbuf);
+#else
+	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+
+	LIST_REMOVE(zv, zv_links);
+	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+		g_topology_lock();
+		zvol_geom_destroy(zv);
+		g_topology_unlock();
+	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+		if (zv->zv_dev != NULL)
+			destroy_dev(zv->zv_dev);
+	}
+#endif
+
 	avl_destroy(&zv->zv_znode.z_range_avl);
 	mutex_destroy(&zv->zv_znode.z_range_lock);
 
-	zvol_geom_destroy(zv);
-
+	kmem_free(zv, sizeof (zvol_state_t));
+#ifdef illumos
+	ddi_soft_state_free(zfsdev_state, minor);
+#endif
 	zvol_minors--;
 	return (0);
 }
@@ -635,15 +805,13 @@
 	zvol_state_t *zv;
 	int rc;
 
-	mutex_enter(&spa_namespace_lock);
+	mutex_enter(&zfsdev_state_lock);
 	if ((zv = zvol_minor_lookup(name)) == NULL) {
-		mutex_exit(&spa_namespace_lock);
+		mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(ENXIO));
 	}
-	g_topology_lock();
 	rc = zvol_remove_zv(zv);
-	g_topology_unlock();
-	mutex_exit(&spa_namespace_lock);
+	mutex_exit(&zfsdev_state_lock);
 	return (rc);
 }
 
@@ -661,6 +829,7 @@
 	if (error)
 		return (error);
 
+	zv->zv_objset = os;
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 	if (error) {
 		ASSERT(error == 0);
@@ -667,15 +836,15 @@
 		dmu_objset_disown(os, zvol_tag);
 		return (error);
 	}
-	zv->zv_objset = os;
+
 	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
 	if (error) {
 		dmu_objset_disown(os, zvol_tag);
 		return (error);
 	}
-	zv->zv_volsize = volsize;
+
+	zvol_size_changed(zv, volsize);
 	zv->zv_zilog = zil_open(os, zvol_get_data);
-	zvol_size_changed(zv);
 
 	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
 	    NULL) == 0);
@@ -708,7 +877,7 @@
 	zv->zv_objset = NULL;
 }
 
-#ifdef sun
+#ifdef illumos
 int
 zvol_prealloc(zvol_state_t *zv)
 {
@@ -728,7 +897,7 @@
 
 	while (resid != 0) {
 		int error;
-		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
+		uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
@@ -747,7 +916,7 @@
 
 	return (0);
 }
-#endif	/* sun */
+#endif	/* illumos */
 
 static int
 zvol_update_volsize(objset_t *os, uint64_t volsize)
@@ -755,10 +924,11 @@
 	dmu_tx_t *tx;
 	int error;
 
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+	dmu_tx_mark_netfree(tx);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
@@ -778,95 +948,86 @@
 void
 zvol_remove_minors(const char *name)
 {
-	struct g_geom *gp, *gptmp;
-	struct g_provider *pp;
+#ifdef illumos
 	zvol_state_t *zv;
+	char *namebuf;
+	minor_t minor;
+
+	namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
+	(void) strncpy(namebuf, name, strlen(name));
+	(void) strcat(namebuf, "/");
+	mutex_enter(&zfsdev_state_lock);
+	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
+
+		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+		if (zv == NULL)
+			continue;
+		if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
+			(void) zvol_remove_zv(zv);
+	}
+	kmem_free(namebuf, strlen(name) + 2);
+
+	mutex_exit(&zfsdev_state_lock);
+#else	/* !illumos */
+	zvol_state_t *zv, *tzv;
 	size_t namelen;
 
 	namelen = strlen(name);
 
 	DROP_GIANT();
-	mutex_enter(&spa_namespace_lock);
-	g_topology_lock();
+	mutex_enter(&zfsdev_state_lock);
 
-	LIST_FOREACH_SAFE(gp, &zfs_zvol_class.geom, geom, gptmp) {
-		pp = LIST_FIRST(&gp->provider);
-		if (pp == NULL)
-			continue;
-		zv = pp->private;
-		if (zv == NULL)
-			continue;
+	LIST_FOREACH_SAFE(zv, &all_zvols, zv_links, tzv) {
 		if (strcmp(zv->zv_name, name) == 0 ||
 		    (strncmp(zv->zv_name, name, namelen) == 0 &&
-		     zv->zv_name[namelen] == '/')) {
+		    strlen(zv->zv_name) > namelen &&
+		    (zv->zv_name[namelen] == '/' || zv->zv_name[namelen] == '@'))) {
 			(void) zvol_remove_zv(zv);
 		}
 	}
 
-	g_topology_unlock();
-	mutex_exit(&spa_namespace_lock);
+	mutex_exit(&zfsdev_state_lock);
 	PICKUP_GIANT();
+#endif	/* illumos */
 }
 
-int
-zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
+static int
+zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
 {
-	zvol_state_t *zv = NULL;
-	objset_t *os;
-	int error;
-	dmu_object_info_t doi;
 	uint64_t old_volsize = 0ULL;
-	uint64_t readonly;
+	int error = 0;
 
-	mutex_enter(&spa_namespace_lock);
-	zv = zvol_minor_lookup(name);
-	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
-		mutex_exit(&spa_namespace_lock);
-		return (error);
-	}
+	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 
-	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
-	    (error = zvol_check_volsize(volsize,
-	    doi.doi_data_block_size)) != 0)
-		goto out;
-
-	VERIFY(dsl_prop_get_integer(name, "readonly", &readonly,
-	    NULL) == 0);
-	if (readonly) {
-		error = EROFS;
-		goto out;
-	}
-
-	error = zvol_update_volsize(os, volsize);
 	/*
 	 * Reinitialize the dump area to the new size. If we
 	 * failed to resize the dump area then restore it back to
-	 * its original size.
+	 * its original size.  We must set the new volsize prior
+	 * to calling dumpvp_resize() to ensure that the devices'
+	 * size(9P) is not visible by the dump subsystem.
 	 */
-	if (zv && error == 0) {
+	old_volsize = zv->zv_volsize;
+	zvol_size_changed(zv, volsize);
+
 #ifdef ZVOL_DUMP
-		if (zv->zv_flags & ZVOL_DUMPIFIED) {
-			old_volsize = zv->zv_volsize;
-			zv->zv_volsize = volsize;
-			if ((error = zvol_dumpify(zv)) != 0 ||
-			    (error = dumpvp_resize()) != 0) {
-				(void) zvol_update_volsize(os, old_volsize);
-				zv->zv_volsize = old_volsize;
-				error = zvol_dumpify(zv);
-			}
+	if (zv->zv_flags & ZVOL_DUMPIFIED) {
+		if ((error = zvol_dumpify(zv)) != 0 ||
+		    (error = dumpvp_resize()) != 0) {
+			int dumpify_error;
+
+			(void) zvol_update_volsize(zv->zv_objset, old_volsize);
+			zvol_size_changed(zv, old_volsize);
+			dumpify_error = zvol_dumpify(zv);
+			error = dumpify_error ? dumpify_error : error;
 		}
+	}
 #endif	/* ZVOL_DUMP */
-		if (error == 0) {
-			zv->zv_volsize = volsize;
-			zvol_size_changed(zv);
-		}
-	}
 
-#ifdef sun
+#ifdef illumos
 	/*
 	 * Generate a LUN expansion event.
 	 */
-	if (zv && error == 0) {
+	if (error == 0) {
 		sysevent_id_t eid;
 		nvlist_t *attr;
 		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
@@ -883,44 +1044,79 @@
 		nvlist_free(attr);
 		kmem_free(physpath, MAXPATHLEN);
 	}
-#endif	/* sun */
+#endif	/* illumos */
+	return (error);
+}
 
-out:
-	dmu_objset_rele(os, FTAG);
+int
+zvol_set_volsize(const char *name, uint64_t volsize)
+{
+	zvol_state_t *zv = NULL;
+	objset_t *os;
+	int error;
+	dmu_object_info_t doi;
+	uint64_t readonly;
+	boolean_t owned = B_FALSE;
 
-	mutex_exit(&spa_namespace_lock);
+	error = dsl_prop_get_integer(name,
+	    zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
+	if (error != 0)
+		return (error);
+	if (readonly)
+		return (SET_ERROR(EROFS));
 
+	mutex_enter(&zfsdev_state_lock);
+	zv = zvol_minor_lookup(name);
+
+	if (zv == NULL || zv->zv_objset == NULL) {
+		if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
+		    FTAG, &os)) != 0) {
+			mutex_exit(&zfsdev_state_lock);
+			return (error);
+		}
+		owned = B_TRUE;
+		if (zv != NULL)
+			zv->zv_objset = os;
+	} else {
+		os = zv->zv_objset;
+	}
+
+	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
+	    (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0)
+		goto out;
+
+	error = zvol_update_volsize(os, volsize);
+
+	if (error == 0 && zv != NULL)
+		error = zvol_update_live_volsize(zv, volsize);
+out:
+	if (owned) {
+		dmu_objset_disown(os, FTAG);
+		if (zv != NULL)
+			zv->zv_objset = NULL;
+	}
+	mutex_exit(&zfsdev_state_lock);
 	return (error);
 }
 
 /*ARGSUSED*/
+#ifdef illumos
+int
+zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
+#else
 static int
 zvol_open(struct g_provider *pp, int flag, int count)
+#endif
 {
 	zvol_state_t *zv;
 	int err = 0;
-	boolean_t locked = B_FALSE;
+#ifdef illumos
 
-	/*
-	 * Protect against recursively entering spa_namespace_lock
-	 * when spa_open() is used for a pool on a (local) ZVOL(s).
-	 * This is needed since we replaced upstream zfsdev_state_lock
-	 * with spa_namespace_lock in the ZVOL code.
-	 * We are using the same trick as spa_open().
-	 * Note that calls in zvol_first_open which need to resolve
-	 * pool name to a spa object will enter spa_open()
-	 * recursively, but that function already has all the
-	 * necessary protection.
-	 */
-	if (!MUTEX_HELD(&spa_namespace_lock)) {
-		mutex_enter(&spa_namespace_lock);
-		locked = B_TRUE;
-	}
+	mutex_enter(&zfsdev_state_lock);
 
-	zv = pp->private;
+	zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
 	if (zv == NULL) {
-		if (locked)
-			mutex_exit(&spa_namespace_lock);
+		mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(ENXIO));
 	}
 
@@ -927,10 +1123,41 @@
 	if (zv->zv_total_opens == 0)
 		err = zvol_first_open(zv);
 	if (err) {
-		if (locked)
-			mutex_exit(&spa_namespace_lock);
+		mutex_exit(&zfsdev_state_lock);
 		return (err);
 	}
+#else	/* !illumos */
+	if (tsd_get(zfs_geom_probe_vdev_key) != NULL) {
+		/*
+		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
+		 * attempting to probe geom providers while looking for a
+		 * replacement for a missing VDEV.  In this case, the
+		 * spa_namespace_lock will not be held, but it is still illegal
+		 * to use a zvol as a vdev.  Deadlocks can result if another
+		 * thread has spa_namespace_lock.
+		 */
+		return (EOPNOTSUPP);
+	}
+
+	mutex_enter(&zfsdev_state_lock);
+
+	zv = pp->private;
+	if (zv == NULL) {
+		mutex_exit(&zfsdev_state_lock);
+		return (SET_ERROR(ENXIO));
+	}
+
+	if (zv->zv_total_opens == 0) {
+		err = zvol_first_open(zv);
+		if (err) {
+			mutex_exit(&zfsdev_state_lock);
+			return (err);
+		}
+		pp->mediasize = zv->zv_volsize;
+		pp->stripeoffset = 0;
+		pp->stripesize = zv->zv_volblocksize;
+	}
+#endif	/* illumos */
 	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
 		err = SET_ERROR(EROFS);
 		goto out;
@@ -949,20 +1176,44 @@
 	}
 #endif
 
+#ifdef illumos
+	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
+		zv->zv_open_count[otyp]++;
+		zv->zv_total_opens++;
+	}
+	mutex_exit(&zfsdev_state_lock);
+#else
 	zv->zv_total_opens += count;
-	if (locked)
-		mutex_exit(&spa_namespace_lock);
+	mutex_exit(&zfsdev_state_lock);
+#endif
 
 	return (err);
 out:
 	if (zv->zv_total_opens == 0)
 		zvol_last_close(zv);
-	if (locked)
-		mutex_exit(&spa_namespace_lock);
+#ifdef illumos
+	mutex_exit(&zfsdev_state_lock);
+#else
+	mutex_exit(&zfsdev_state_lock);
+#endif
 	return (err);
 }
 
 /*ARGSUSED*/
+#ifdef illumos
+int
+zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
+{
+	minor_t minor = getminor(dev);
+	zvol_state_t *zv;
+	int error = 0;
+
+	mutex_enter(&zfsdev_state_lock);
+
+	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+	if (zv == NULL) {
+		mutex_exit(&zfsdev_state_lock);
+#else	/* !illumos */
 static int
 zvol_close(struct g_provider *pp, int flag, int count)
 {
@@ -971,8 +1222,8 @@
 	boolean_t locked = B_FALSE;
 
 	/* See comment in zvol_open(). */
-	if (!MUTEX_HELD(&spa_namespace_lock)) {
-		mutex_enter(&spa_namespace_lock);
+	if (!MUTEX_HELD(&zfsdev_state_lock)) {
+		mutex_enter(&zfsdev_state_lock);
 		locked = B_TRUE;
 	}
 
@@ -979,7 +1230,8 @@
 	zv = pp->private;
 	if (zv == NULL) {
 		if (locked)
-			mutex_exit(&spa_namespace_lock);
+			mutex_exit(&zfsdev_state_lock);
+#endif	/* illumos */
 		return (SET_ERROR(ENXIO));
 	}
 
@@ -992,18 +1244,30 @@
 	 * If the open count is zero, this is a spurious close.
 	 * That indicates a bug in the kernel / DDI framework.
 	 */
+#ifdef illumos
+	ASSERT(zv->zv_open_count[otyp] != 0);
+#endif
 	ASSERT(zv->zv_total_opens != 0);
 
 	/*
 	 * You may get multiple opens, but only one close.
 	 */
+#ifdef illumos
+	zv->zv_open_count[otyp]--;
+	zv->zv_total_opens--;
+#else
 	zv->zv_total_opens -= count;
+#endif
 
 	if (zv->zv_total_opens == 0)
 		zvol_last_close(zv);
 
+#ifdef illumos
+	mutex_exit(&zfsdev_state_lock);
+#else
 	if (locked)
-		mutex_exit(&spa_namespace_lock);
+		mutex_exit(&zfsdev_state_lock);
+#endif
 	return (error);
 }
 
@@ -1042,7 +1306,6 @@
 
 	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 	zgd->zgd_zilog = zv->zv_zilog;
-	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
 
 	/*
 	 * Write records come in two flavors: immediate and indirect.
@@ -1051,12 +1314,22 @@
 	 * sync the data and get a pointer to it (indirect) so that
 	 * we don't have to write the data twice.
 	 */
-	if (buf != NULL) {	/* immediate write */
+	if (buf != NULL) { /* immediate write */
+		zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
+		    RL_READER);
 		error = dmu_read(os, object, offset, size, buf,
 		    DMU_READ_NO_PREFETCH);
-	} else {
+	} else { /* indirect write */
+		/*
+		 * Have to lock the whole block to ensure when it's written out
+		 * and its checksum is being calculated that no one can change
+		 * the data. Contrary to zfs_get_data, we need not re-check
+		 * blocksize after we get the lock because it cannot be changed.
+		 */
 		size = zv->zv_volblocksize;
 		offset = P2ALIGN(offset, size);
+		zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
+		    RL_READER);
 		error = dmu_buf_hold(os, object, offset, zgd, &db,
 		    DMU_READ_NO_PREFETCH);
 		if (error == 0) {
@@ -1099,54 +1372,44 @@
 {
 	uint32_t blocksize = zv->zv_volblocksize;
 	zilog_t *zilog = zv->zv_zilog;
-	boolean_t slogging;
-	ssize_t immediate_write_sz;
+	itx_wr_state_t write_state;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
-	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
-	    ? 0 : zvol_immediate_write_sz;
+	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+		write_state = WR_INDIRECT;
+	else if (!spa_has_slogs(zilog->zl_spa) &&
+	    resid >= blocksize && blocksize > zvol_immediate_write_sz)
+		write_state = WR_INDIRECT;
+	else if (sync)
+		write_state = WR_COPIED;
+	else
+		write_state = WR_NEED_COPY;
 
-	slogging = spa_has_slogs(zilog->zl_spa) &&
-	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
-
 	while (resid) {
 		itx_t *itx;
 		lr_write_t *lr;
-		ssize_t len;
-		itx_wr_state_t write_state;
+		itx_wr_state_t wr_state = write_state;
+		ssize_t len = resid;
 
-		/*
-		 * Unlike zfs_log_write() we can be called with
-		 * upto DMU_MAX_ACCESS/2 (5MB) writes.
-		 */
-		if (blocksize > immediate_write_sz && !slogging &&
-		    resid >= blocksize && off % blocksize == 0) {
-			write_state = WR_INDIRECT; /* uses dmu_sync */
-			len = blocksize;
-		} else if (sync) {
-			write_state = WR_COPIED;
-			len = MIN(ZIL_MAX_LOG_DATA, resid);
-		} else {
-			write_state = WR_NEED_COPY;
-			len = MIN(ZIL_MAX_LOG_DATA, resid);
-		}
+		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
+			wr_state = WR_NEED_COPY;
+		else if (wr_state == WR_INDIRECT)
+			len = MIN(blocksize - P2PHASE(off, blocksize), resid);
 
 		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
-		    (write_state == WR_COPIED ? len : 0));
+		    (wr_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
-		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
+		if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
 		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
-			write_state = WR_NEED_COPY;
+			wr_state = WR_NEED_COPY;
 		}
 
-		itx->itx_wr_state = write_state;
-		if (write_state == WR_NEED_COPY)
-			itx->itx_sod += len;
+		itx->itx_wr_state = wr_state;
 		lr->lr_foid = ZVOL_OBJ;
 		lr->lr_offset = off;
 		lr->lr_length = len;
@@ -1154,8 +1417,10 @@
 		BP_ZERO(&lr->lr_blkptr);
 
 		itx->itx_private = zv;
-		itx->itx_sync = sync;
 
+		if (!sync && (zv->zv_sync_cnt == 0))
+			itx->itx_sync = B_FALSE;
+
 		zil_itx_assign(zilog, itx, tx);
 
 		off += len;
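The rewritten policy above reduces to a small decision table: throughput logbias always goes indirect; with no separate log device, block-sized-or-larger writes go indirect too; otherwise synchronous writes are copied into the itx and asynchronous ones deferred (with a per-iteration demotion from WR_COPIED to WR_NEED_COPY when resid exceeds ZIL_MAX_COPIED_DATA). A standalone restatement of the table, with boolean inputs standing in for the real predicates (hypothetical names, not the tree's types):

typedef enum { MY_WR_INDIRECT, MY_WR_COPIED, MY_WR_NEED_COPY } wr_state_t;

/*
 * Mirror of the selection above: throughput_bias ~ ZFS_LOGBIAS_THROUGHPUT,
 * large_write ~ (resid >= blocksize && blocksize > zvol_immediate_write_sz).
 */
static wr_state_t
pick_write_state(int throughput_bias, int has_slog, int large_write, int sync)
{
	if (throughput_bias)
		return (MY_WR_INDIRECT);
	if (!has_slog && large_write)
		return (MY_WR_INDIRECT);
	return (sync ? MY_WR_COPIED : MY_WR_NEED_COPY);
}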
@@ -1163,29 +1428,30 @@
 	}
 }
 
-#ifdef sun
+#ifdef illumos
 static int
-zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
-    boolean_t doread, boolean_t isdump)
+zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
+    uint64_t size, boolean_t doread, boolean_t isdump)
 {
 	vdev_disk_t *dvd;
 	int c;
 	int numerrors = 0;
 
-	for (c = 0; c < vd->vdev_children; c++) {
-		ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
-		    vd->vdev_ops == &vdev_replacing_ops ||
-		    vd->vdev_ops == &vdev_spare_ops);
-		int err = zvol_dumpio_vdev(vd->vdev_child[c],
-		    addr, offset, size, doread, isdump);
-		if (err != 0) {
-			numerrors++;
-		} else if (doread) {
-			break;
+	if (vd->vdev_ops == &vdev_mirror_ops ||
+	    vd->vdev_ops == &vdev_replacing_ops ||
+	    vd->vdev_ops == &vdev_spare_ops) {
+		for (c = 0; c < vd->vdev_children; c++) {
+			int err = zvol_dumpio_vdev(vd->vdev_child[c],
+			    addr, offset, origoffset, size, doread, isdump);
+			if (err != 0) {
+				numerrors++;
+			} else if (doread) {
+				break;
+			}
 		}
 	}
 
-	if (!vd->vdev_ops->vdev_op_leaf)
+	if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
 		return (numerrors < vd->vdev_children ? 0 : EIO);
 
 	if (doread && !vdev_readable(vd))
@@ -1193,8 +1459,11 @@
 	else if (!doread && !vdev_writeable(vd))
 		return (SET_ERROR(EIO));
 
-	dvd = vd->vdev_tsd;
-	ASSERT3P(dvd, !=, NULL);
+	if (vd->vdev_ops == &vdev_raidz_ops) {
+		return (vdev_raidz_physio(vd,
+		    addr, size, offset, origoffset, doread, isdump));
+	}
+
 	offset += VDEV_LABEL_START_SIZE;
 
 	if (ddi_in_panic() || isdump) {
@@ -1201,11 +1470,15 @@
 		ASSERT(!doread);
 		if (doread)
 			return (SET_ERROR(EIO));
+		dvd = vd->vdev_tsd;
+		ASSERT3P(dvd, !=, NULL);
 		return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
 		    lbtodb(size)));
 	} else {
-		return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
-		    doread ? B_READ : B_WRITE));
+		dvd = vd->vdev_tsd;
+		ASSERT3P(dvd, !=, NULL);
+		return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
+		    offset, doread ? B_READ : B_WRITE));
 	}
 }
 
@@ -1240,7 +1513,8 @@
 
 	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
 	offset += DVA_GET_OFFSET(&ze->ze_dva);
-	error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump);
+	error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
+	    size, doread, isdump);
 
 	if (!ddi_in_panic())
 		spa_config_exit(spa, SCL_STATE, FTAG);
@@ -1247,12 +1521,17 @@
 
 	return (error);
 }
-#endif	/* sun */
 
 int
+zvol_strategy(buf_t *bp)
+{
+	zfs_soft_state_t *zs = NULL;
+#else	/* !illumos */
+void
 zvol_strategy(struct bio *bp)
 {
-	zvol_state_t *zv = bp->bio_to->private;
+#endif	/* illumos */
+	zvol_state_t *zv;
 	uint64_t off, volsize;
 	size_t resid;
 	char *addr;
@@ -1259,34 +1538,105 @@
 	objset_t *os;
 	rl_t *rl;
 	int error = 0;
-	boolean_t doread = (bp->bio_cmd == BIO_READ);
+#ifdef illumos
+	boolean_t doread = bp->b_flags & B_READ;
+#else
+	boolean_t doread = 0;
+#endif
+	boolean_t is_dumpified;
 	boolean_t sync;
 
-	if (zv == NULL) {
-		g_io_deliver(bp, ENXIO);
+#ifdef illumos
+	if (getminor(bp->b_edev) == 0) {
+		error = SET_ERROR(EINVAL);
+	} else {
+		zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
+		if (zs == NULL)
+			error = SET_ERROR(ENXIO);
+		else if (zs->zss_type != ZSST_ZVOL)
+			error = SET_ERROR(EINVAL);
+	}
+
+	if (error) {
+		bioerror(bp, error);
+		biodone(bp);
 		return (0);
 	}
 
-	if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
-		g_io_deliver(bp, EROFS);
+	zv = zs->zss_data;
+
+	if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
+		bioerror(bp, EROFS);
+		biodone(bp);
 		return (0);
 	}
 
+	off = ldbtob(bp->b_blkno);
+#else	/* !illumos */
+	if (bp->bio_to)
+		zv = bp->bio_to->private;
+	else
+		zv = bp->bio_dev->si_drv2;
+
+	if (zv == NULL) {
+		error = SET_ERROR(ENXIO);
+		goto out;
+	}
+
+	if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
+		error = SET_ERROR(EROFS);
+		goto out;
+	}
+
+	switch (bp->bio_cmd) {
+	case BIO_FLUSH:
+		goto sync;
+	case BIO_READ:
+		doread = 1;
+	case BIO_WRITE:
+	case BIO_DELETE:
+		break;
+	default:
+		error = EOPNOTSUPP;
+		goto out;
+	}
+
 	off = bp->bio_offset;
+#endif	/* illumos */
 	volsize = zv->zv_volsize;
 
 	os = zv->zv_objset;
 	ASSERT(os != NULL);
 
+#ifdef illumos
+	bp_mapin(bp);
+	addr = bp->b_un.b_addr;
+	resid = bp->b_bcount;
+
+	if (resid > 0 && (off < 0 || off >= volsize)) {
+		bioerror(bp, EIO);
+		biodone(bp);
+		return (0);
+	}
+
+	is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
+	sync = ((!(bp->b_flags & B_ASYNC) &&
+	    !(zv->zv_flags & ZVOL_WCE)) ||
+	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
+	    !doread && !is_dumpified;
+#else	/* !illumos */
 	addr = bp->bio_data;
 	resid = bp->bio_length;
 
 	if (resid > 0 && (off < 0 || off >= volsize)) {
-		g_io_deliver(bp, EIO);
-		return (0);
+		error = SET_ERROR(EIO);
+		goto out;
 	}
 
-        sync = !doread && zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+	is_dumpified = B_FALSE;
+	sync = !doread && !is_dumpified &&
+	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+#endif	/* illumos */
 
 	/*
 	 * There must be no buffer changes when doing a dmu_sync() because
@@ -1295,9 +1645,33 @@
 	rl = zfs_range_lock(&zv->zv_znode, off, resid,
 	    doread ? RL_READER : RL_WRITER);
 
+#ifndef illumos
+	if (bp->bio_cmd == BIO_DELETE) {
+		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error != 0) {
+			dmu_tx_abort(tx);
+		} else {
+			zvol_log_truncate(zv, tx, off, resid, sync);
+			dmu_tx_commit(tx);
+			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+			    off, resid);
+			resid = 0;
+		}
+		goto unlock;
+	}
+#endif
 	while (resid != 0 && off < volsize) {
 		size_t size = MIN(resid, zvol_maxphys);
+#ifdef illumos
+		if (is_dumpified) {
+			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
+			error = zvol_dumpio(zv, addr, off, size,
+			    doread, B_FALSE);
+		} else if (doread) {
+#else
 		if (doread) {
+#endif
 			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
 			    DMU_READ_PREFETCH);
 		} else {
@@ -1322,20 +1696,38 @@
 		addr += size;
 		resid -= size;
 	}
+#ifndef illumos
+unlock:
+#endif
 	zfs_range_unlock(rl);
 
-	bp->bio_completed = bp->bio_length - resid;
-	if (bp->bio_completed < bp->bio_length)
-		bp->bio_error = (off > volsize ? EINVAL : error);
+#ifdef illumos
+	if ((bp->b_resid = resid) == bp->b_bcount)
+		bioerror(bp, off > volsize ? EINVAL : error);
 
 	if (sync)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
-	g_io_deliver(bp, 0);
+	biodone(bp);
 
 	return (0);
+#else	/* !illumos */
+	bp->bio_completed = bp->bio_length - resid;
+	if (bp->bio_completed < bp->bio_length && off > volsize)
+		error = EINVAL;
+
+	if (sync) {
+sync:
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+	}
+out:
+	if (bp->bio_to)
+		g_io_deliver(bp, error);
+	else
+		biofinish(bp, NULL, error);
+#endif	/* illumos */
 }
 
-#ifdef sun
+#ifdef illumos
 /*
  * Set the buffer count to the zvol maximum transfer.
  * Using our own routine instead of the default minphys()
@@ -1391,25 +1783,37 @@
 zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
 {
 	minor_t minor = getminor(dev);
+#else	/* !illumos */
+int
+zvol_read(struct cdev *dev, struct uio *uio, int ioflag)
+{
+#endif	/* illumos */
 	zvol_state_t *zv;
 	uint64_t volsize;
 	rl_t *rl;
 	int error = 0;
 
+#ifdef illumos
 	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 	if (zv == NULL)
 		return (SET_ERROR(ENXIO));
+#else
+	zv = dev->si_drv2;
+#endif
 
 	volsize = zv->zv_volsize;
 	/* uio_loffset == volsize isn't an error as it's required for EOF processing. */
 	if (uio->uio_resid > 0 &&
-	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
+	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
 		return (SET_ERROR(EIO));
 
+#ifdef illumos
 	if (zv->zv_flags & ZVOL_DUMPIFIED) {
 		error = physio(zvol_strategy, NULL, dev, B_READ,
 		    zvol_minphys, uio);
 		return (error);
 	}
+#endif
 
 	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
 	    RL_READER);
@@ -1420,7 +1824,7 @@
 		if (bytes > volsize - uio->uio_loffset)
 			bytes = volsize - uio->uio_loffset;
 
-		error =  dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
+		error =  dmu_read_uio_dbuf(zv->zv_dbuf, uio, bytes);
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
@@ -1432,11 +1836,17 @@
 	return (error);
 }
 
+#ifdef illumos
 /*ARGSUSED*/
 int
 zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
 {
 	minor_t minor = getminor(dev);
+#else	/* !illumos */
+int
+zvol_write(struct cdev *dev, struct uio *uio, int ioflag)
+{
+#endif	/* illumos */
 	zvol_state_t *zv;
 	uint64_t volsize;
 	rl_t *rl;
@@ -1443,15 +1853,21 @@
 	int error = 0;
 	boolean_t sync;
 
+#ifdef illumos
 	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 	if (zv == NULL)
 		return (SET_ERROR(ENXIO));
+#else
+	zv = dev->si_drv2;
+#endif
 
 	volsize = zv->zv_volsize;
 	/* uio_loffset == volsize isn't an error as it's required for EOF processing. */
 	if (uio->uio_resid > 0 &&
-	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
+	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
 		return (SET_ERROR(EIO));
 
+#ifdef illumos
 	if (zv->zv_flags & ZVOL_DUMPIFIED) {
 		error = physio(zvol_strategy, NULL, dev, B_WRITE,
 		    zvol_minphys, uio);
@@ -1459,6 +1875,9 @@
 	}
 
 	sync = !(zv->zv_flags & ZVOL_WCE) ||
+#else
+	sync = (ioflag & IO_SYNC) ||
+#endif
 	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
 
 	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
@@ -1491,6 +1910,7 @@
 	return (error);
 }
 
+#ifdef illumos
 int
 zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
 {
@@ -1619,9 +2039,36 @@
 /*
  * END entry points to allow external callers access to the volume.
  */
+#endif	/* illumos */
 
 /*
+ * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
+ */
+static void
+zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
+    boolean_t sync)
+{
+	itx_t *itx;
+	lr_truncate_t *lr;
+	zilog_t *zilog = zv->zv_zilog;
+
+	if (zil_replaying(zilog, tx))
+		return;
+
+	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+	lr = (lr_truncate_t *)&itx->itx_lr;
+	lr->lr_foid = ZVOL_OBJ;
+	lr->lr_offset = off;
+	lr->lr_length = len;
+
+	itx->itx_sync = (sync || zv->zv_sync_cnt != 0);
+	zil_itx_assign(zilog, itx, tx);
+}
+
+#ifdef illumos
+/*
  * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
+ * Also a dirtbag dkio ioctl for unmap/free-block functionality.
  */
 /*ARGSUSED*/
 int
@@ -1628,18 +2075,16 @@
 zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
 {
 	zvol_state_t *zv;
-	struct dk_cinfo dki;
-	struct dk_minfo dkm;
 	struct dk_callback *dkc;
 	int error = 0;
 	rl_t *rl;
 
-	mutex_enter(&spa_namespace_lock);
+	mutex_enter(&zfsdev_state_lock);
 
 	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
 
 	if (zv == NULL) {
-		mutex_exit(&spa_namespace_lock);
+		mutex_exit(&zfsdev_state_lock);
 		return (SET_ERROR(ENXIO));
 	}
 	ASSERT(zv->zv_total_opens > 0);
@@ -1647,40 +2092,64 @@
 	switch (cmd) {
 
 	case DKIOCINFO:
+	{
+		struct dk_cinfo dki;
+
 		bzero(&dki, sizeof (dki));
 		(void) strcpy(dki.dki_cname, "zvol");
 		(void) strcpy(dki.dki_dname, "zvol");
 		dki.dki_ctype = DKC_UNKNOWN;
 		dki.dki_unit = getminor(dev);
-		dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
-		mutex_exit(&spa_namespace_lock);
+		dki.dki_maxtransfer =
+		    1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
+		mutex_exit(&zfsdev_state_lock);
 		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
 			error = SET_ERROR(EFAULT);
 		return (error);
+	}
 
 	case DKIOCGMEDIAINFO:
+	{
+		struct dk_minfo dkm;
+
 		bzero(&dkm, sizeof (dkm));
 		dkm.dki_lbsize = 1U << zv->zv_min_bs;
 		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
 		dkm.dki_media_type = DK_UNKNOWN;
-		mutex_exit(&spa_namespace_lock);
+		mutex_exit(&zfsdev_state_lock);
 		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
 			error = SET_ERROR(EFAULT);
 		return (error);
+	}
 
+	case DKIOCGMEDIAINFOEXT:
+	{
+		struct dk_minfo_ext dkmext;
+
+		bzero(&dkmext, sizeof (dkmext));
+		dkmext.dki_lbsize = 1U << zv->zv_min_bs;
+		dkmext.dki_pbsize = zv->zv_volblocksize;
+		dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
+		dkmext.dki_media_type = DK_UNKNOWN;
+		mutex_exit(&zfsdev_state_lock);
+		if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
+			error = SET_ERROR(EFAULT);
+		return (error);
+	}
+
 	case DKIOCGETEFI:
-		{
-			uint64_t vs = zv->zv_volsize;
-			uint8_t bs = zv->zv_min_bs;
+	{
+		uint64_t vs = zv->zv_volsize;
+		uint8_t bs = zv->zv_min_bs;
 
-			mutex_exit(&spa_namespace_lock);
-			error = zvol_getefi((void *)arg, flag, vs, bs);
-			return (error);
-		}
+		mutex_exit(&zfsdev_state_lock);
+		error = zvol_getefi((void *)arg, flag, vs, bs);
+		return (error);
+	}
 
 	case DKIOCFLUSHWRITECACHE:
 		dkc = (struct dk_callback *)arg;
-		mutex_exit(&spa_namespace_lock);
+		mutex_exit(&zfsdev_state_lock);
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
 			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
@@ -1689,31 +2158,31 @@
 		return (error);
 
 	case DKIOCGETWCE:
-		{
-			int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
-			if (ddi_copyout(&wce, (void *)arg, sizeof (int),
-			    flag))
-				error = SET_ERROR(EFAULT);
+	{
+		int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
+		if (ddi_copyout(&wce, (void *)arg, sizeof (int),
+		    flag))
+			error = SET_ERROR(EFAULT);
+		break;
+	}
+	case DKIOCSETWCE:
+	{
+		int wce;
+		if (ddi_copyin((void *)arg, &wce, sizeof (int),
+		    flag)) {
+			error = SET_ERROR(EFAULT);
 			break;
 		}
-	case DKIOCSETWCE:
-		{
-			int wce;
-			if (ddi_copyin((void *)arg, &wce, sizeof (int),
-			    flag)) {
-				error = SET_ERROR(EFAULT);
-				break;
-			}
-			if (wce) {
-				zv->zv_flags |= ZVOL_WCE;
-				mutex_exit(&spa_namespace_lock);
-			} else {
-				zv->zv_flags &= ~ZVOL_WCE;
-				mutex_exit(&spa_namespace_lock);
-				zil_commit(zv->zv_zilog, ZVOL_OBJ);
-			}
-			return (0);
+		if (wce) {
+			zv->zv_flags |= ZVOL_WCE;
+			mutex_exit(&zfsdev_state_lock);
+		} else {
+			zv->zv_flags &= ~ZVOL_WCE;
+			mutex_exit(&zfsdev_state_lock);
+			zil_commit(zv->zv_zilog, ZVOL_OBJ);
 		}
+		return (0);
+	}
 
 	case DKIOCGGEOM:
 	case DKIOCGVTOC:
@@ -1745,6 +2214,9 @@
 		dkioc_free_t df;
 		dmu_tx_t *tx;
 
+		if (!zvol_unmap_enabled)
+			break;
+
 		if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
 			error = SET_ERROR(EFAULT);
 			break;
@@ -1757,12 +2229,13 @@
 		 */
 		if (df.df_start >= zv->zv_volsize)
 			break;	/* No need to do anything... */
-		if (df.df_start + df.df_length > zv->zv_volsize)
-			df.df_length = DMU_OBJECT_END;
 
+		mutex_exit(&zfsdev_state_lock);
+
 		rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
 		    RL_WRITER);
 		tx = dmu_tx_create(zv->zv_objset);
+		dmu_tx_mark_netfree(tx);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error != 0) {
 			dmu_tx_abort(tx);
@@ -1796,7 +2269,7 @@
 				    dmu_objset_pool(zv->zv_objset), 0);
 			}
 		}
-		break;
+		return (error);
 	}
 
 	default:
@@ -1804,10 +2277,10 @@
 		break;
 
 	}
-	mutex_exit(&spa_namespace_lock);
+	mutex_exit(&zfsdev_state_lock);
 	return (error);
 }
-#endif	/* sun */
+#endif	/* illumos */
 
 int
 zvol_busy(void)
@@ -1820,37 +2293,117 @@
 {
 	VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
 	    1) == 0);
+#ifdef illumos
+	mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
+#else
 	ZFS_LOG(1, "ZVOL Initialized.");
+#endif
 }
 
 void
 zvol_fini(void)
 {
+#ifdef illumos
+	mutex_destroy(&zfsdev_state_lock);
+#endif
 	ddi_soft_state_fini(&zfsdev_state);
 	ZFS_LOG(1, "ZVOL Deinitialized.");
 }
 
-#ifdef sun
+#ifdef illumos
+/*ARGSUSED*/
 static int
+zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+	if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
+		return (1);
+	return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+	spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
+}
+
+static int
 zvol_dump_init(zvol_state_t *zv, boolean_t resize)
 {
 	dmu_tx_t *tx;
-	int error = 0;
+	int error;
 	objset_t *os = zv->zv_objset;
+	spa_t *spa = dmu_objset_spa(os);
+	vdev_t *vd = spa->spa_root_vdev;
 	nvlist_t *nv = NULL;
-	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
+	uint64_t version = spa_version(spa);
+	uint64_t checksum, compress, refresrv, vbs, dedup;
 
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+	ASSERT(vd->vdev_ops == &vdev_root_ops);
+
 	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
 	    DMU_OBJECT_END);
+	if (error != 0)
+		return (error);
 	/* wait for dmu_free_long_range to actually free the blocks */
 	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 
+	/*
+	 * If the pool on which the dump device is being initialized has more
+	 * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
+	 * enabled.  If so, bump that feature's counter to indicate that the
+	 * feature is active. We also check the vdev type to handle the
+	 * following case:
+	 *   # zpool create test raidz disk1 disk2 disk3
+ *   Now spa_root_vdev->vdev_children == 1 (the raidz vdev), while
+ *   the raidz vdev itself has 3 children.
+	 */
+	if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
+		if (!spa_feature_is_enabled(spa,
+		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
+			return (SET_ERROR(ENOTSUP));
+		(void) dsl_sync_task(spa_name(spa),
+		    zfs_mvdev_dump_feature_check,
+		    zfs_mvdev_dump_activate_feature_sync, NULL,
+		    2, ZFS_SPACE_CHECK_RESERVED);
+	}
+
+	if (!resize) {
+		error = dsl_prop_get_integer(zv->zv_name,
+		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
+		if (error == 0) {
+			error = dsl_prop_get_integer(zv->zv_name,
+			    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum,
+			    NULL);
+		}
+		if (error == 0) {
+			error = dsl_prop_get_integer(zv->zv_name,
+			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+			    &refresrv, NULL);
+		}
+		if (error == 0) {
+			error = dsl_prop_get_integer(zv->zv_name,
+			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs,
+			    NULL);
+		}
+		if (version >= SPA_VERSION_DEDUP && error == 0) {
+			error = dsl_prop_get_integer(zv->zv_name,
+			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
+		}
+	}
+	if (error != 0)
+		return (error);
+
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
 	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
+	if (error != 0) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
@@ -1866,42 +2419,35 @@
 		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
 		    &zv->zv_volsize, tx);
 	} else {
-		uint64_t checksum, compress, refresrv, vbs, dedup;
-
-		error = dsl_prop_get_integer(zv->zv_name,
-		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
-		error = error ? error : dsl_prop_get_integer(zv->zv_name,
-		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
-		error = error ? error : dsl_prop_get_integer(zv->zv_name,
-		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
-		error = error ? error : dsl_prop_get_integer(zv->zv_name,
-		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
-		if (version >= SPA_VERSION_DEDUP) {
-			error = error ? error :
-			    dsl_prop_get_integer(zv->zv_name,
-			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
-		}
-
-		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
+		error = zap_update(os, ZVOL_ZAP_OBJ,
 		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
 		    &compress, tx);
-		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
-		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
-		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
-		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
-		    &refresrv, tx);
-		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
-		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
-		    &vbs, tx);
-		error = error ? error : dmu_object_set_blocksize(
-		    os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
-		if (version >= SPA_VERSION_DEDUP) {
-			error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
+		if (error == 0) {
+			error = zap_update(os, ZVOL_ZAP_OBJ,
+			    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1,
+			    &checksum, tx);
+		}
+		if (error == 0) {
+			error = zap_update(os, ZVOL_ZAP_OBJ,
+			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
+			    &refresrv, tx);
+		}
+		if (error == 0) {
+			error = zap_update(os, ZVOL_ZAP_OBJ,
+			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
+			    &vbs, tx);
+		}
+		if (error == 0) {
+			error = dmu_object_set_blocksize(
+			    os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
+		}
+		if (version >= SPA_VERSION_DEDUP && error == 0) {
+			error = zap_update(os, ZVOL_ZAP_OBJ,
 			    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
 			    &dedup, tx);
 		}
 		if (error == 0)
-			zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
+			zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
 	}
 	dmu_tx_commit(tx);
 
@@ -1909,7 +2455,15 @@
 	 * We only need to update the zvol's property if we are initializing
 	 * the dump area for the first time.
 	 */
-	if (!resize) {
+	if (error == 0 && !resize) {
+		/*
+		 * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
+		 * function.  Otherwise, use the old default -- OFF.
+		 */
+		checksum = spa_feature_is_active(spa,
+		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
+		    ZIO_CHECKSUM_OFF;
+
 		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_uint64(nv,
 		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
@@ -1918,7 +2472,7 @@
 		    ZIO_COMPRESS_OFF) == 0);
 		VERIFY(nvlist_add_uint64(nv,
 		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
-		    ZIO_CHECKSUM_OFF) == 0);
+		    checksum) == 0);
 		if (version >= SPA_VERSION_DEDUP) {
 			VERIFY(nvlist_add_uint64(nv,
 			    zfs_prop_to_name(ZFS_PROP_DEDUP),
@@ -1928,13 +2482,11 @@
 		error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
 		    nv, NULL);
 		nvlist_free(nv);
-
-		if (error)
-			return (error);
 	}
 
 	/* Allocate the space for the dump */
-	error = zvol_prealloc(zv);
+	if (error == 0)
+		error = zvol_prealloc(zv);
 	return (error);
 }
 
@@ -2062,32 +2614,8 @@
 
 	return (0);
 }
-#endif	/* sun */
+#else	/* !illumos */
 
-static zvol_state_t *
-zvol_geom_create(const char *name)
-{
-	struct g_provider *pp;
-	struct g_geom *gp;
-	zvol_state_t *zv;
-
-	gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
-	gp->start = zvol_geom_start;
-	gp->access = zvol_geom_access;
-	pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
-	pp->sectorsize = DEV_BSIZE;
-
-	zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
-	zv->zv_provider = pp;
-	zv->zv_state = 0;
-	bioq_init(&zv->zv_queue);
-	mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
-
-	pp->private = zv;
-
-	return (zv);
-}
-
 static void
 zvol_geom_run(zvol_state_t *zv)
 {
@@ -2118,8 +2646,6 @@
 	zv->zv_provider = NULL;
 	pp->private = NULL;
 	g_wither_geom(pp->geom, ENXIO);
-
-	kmem_free(zv, sizeof(*zv));
 }
 
 static int
@@ -2178,25 +2704,67 @@
 	zvol_state_t *zv;
 	boolean_t first;
 
+	zv = bp->bio_to->private;
+	ASSERT(zv != NULL);
 	switch (bp->bio_cmd) {
+	case BIO_FLUSH:
+		if (!THREAD_CAN_SLEEP())
+			goto enqueue;
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+		g_io_deliver(bp, 0);
+		break;
 	case BIO_READ:
 	case BIO_WRITE:
-	case BIO_FLUSH:
-		zv = bp->bio_to->private;
-		ASSERT(zv != NULL);
-		mtx_lock(&zv->zv_queue_mtx);
-		first = (bioq_first(&zv->zv_queue) == NULL);
-		bioq_insert_tail(&zv->zv_queue, bp);
-		mtx_unlock(&zv->zv_queue_mtx);
-		if (first)
-			wakeup_one(&zv->zv_queue);
+	case BIO_DELETE:
+		if (!THREAD_CAN_SLEEP())
+			goto enqueue;
+		zvol_strategy(bp);
 		break;
-	case BIO_GETATTR:
-	case BIO_DELETE:
+	case BIO_GETATTR: {
+		spa_t *spa = dmu_objset_spa(zv->zv_objset);
+		uint64_t refd, avail, usedobjs, availobjs, val;
+
+		if (g_handleattr_int(bp, "GEOM::candelete", 1))
+			return;
+		if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
+			dmu_objset_space(zv->zv_objset, &refd, &avail,
+			    &usedobjs, &availobjs);
+			if (g_handleattr_off_t(bp, "blocksavail",
+			    avail / DEV_BSIZE))
+				return;
+		} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
+			dmu_objset_space(zv->zv_objset, &refd, &avail,
+			    &usedobjs, &availobjs);
+			if (g_handleattr_off_t(bp, "blocksused",
+			    refd / DEV_BSIZE))
+				return;
+		} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
+			avail = metaslab_class_get_space(spa_normal_class(spa));
+			avail -= metaslab_class_get_alloc(spa_normal_class(spa));
+			if (g_handleattr_off_t(bp, "poolblocksavail",
+			    avail / DEV_BSIZE))
+				return;
+		} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
+			refd = metaslab_class_get_alloc(spa_normal_class(spa));
+			if (g_handleattr_off_t(bp, "poolblocksused",
+			    refd / DEV_BSIZE))
+				return;
+		}
+		/* FALLTHROUGH */
+	}
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		break;
 	}
+	return;
+
+enqueue:
+	mtx_lock(&zv->zv_queue_mtx);
+	first = (bioq_first(&zv->zv_queue) == NULL);
+	bioq_insert_tail(&zv->zv_queue, bp);
+	mtx_unlock(&zv->zv_queue_mtx);
+	if (first)
+		wakeup_one(&zv->zv_queue);
 }
 
 static void
@@ -2232,8 +2800,12 @@
 			break;
 		case BIO_READ:
 		case BIO_WRITE:
+		case BIO_DELETE:
 			zvol_strategy(bp);
 			break;
+		default:
+			g_io_deliver(bp, EOPNOTSUPP);
+			break;
 		}
 	}
 }
@@ -2273,7 +2845,8 @@
 			break;
 		}
 
-		if ((error = zvol_create_minor(sname)) != 0) {
+		error = zvol_create_minor(sname);
+		if (error != 0 && error != EEXIST) {
 			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
 			    sname, error);
 			break;
@@ -2303,9 +2876,10 @@
 	if (dmu_objset_type(os) == DMU_OST_ZVOL) {
 		dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
 		dsl_pool_rele(dmu_objset_pool(os), FTAG);
-		if ((error = zvol_create_minor(name)) == 0)
+		error = zvol_create_minor(name);
+		if (error == 0 || error == EEXIST) {
 			error = zvol_create_snapshots(os, name);
-		else {
+		} else {
 			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
 			    name, error);
 		}
@@ -2354,29 +2928,58 @@
 }
 
 static void
-zvol_rename_minor(struct g_geom *gp, const char *newname)
+zvol_rename_minor(zvol_state_t *zv, const char *newname)
 {
+	struct g_geom *gp;
 	struct g_provider *pp;
-	zvol_state_t *zv;
+	struct cdev *dev;
 
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
-	g_topology_assert();
+	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 
-	pp = LIST_FIRST(&gp->provider);
-	ASSERT(pp != NULL);
-	zv = pp->private;
-	ASSERT(zv != NULL);
+	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+		g_topology_lock();
+		pp = zv->zv_provider;
+		ASSERT(pp != NULL);
+		gp = pp->geom;
+		ASSERT(gp != NULL);
 
-	zv->zv_provider = NULL;
-	g_wither_provider(pp, ENXIO);
+		zv->zv_provider = NULL;
+		g_wither_provider(pp, ENXIO);
 
-	pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
-	pp->sectorsize = DEV_BSIZE;
-	pp->mediasize = zv->zv_volsize;
-	pp->private = zv;
-	zv->zv_provider = pp;
+		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
+		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+		pp->sectorsize = DEV_BSIZE;
+		pp->mediasize = zv->zv_volsize;
+		pp->private = zv;
+		zv->zv_provider = pp;
+		g_error_provider(pp, 0);
+		g_topology_unlock();
+	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+		struct make_dev_args args;
+
+		if ((dev = zv->zv_dev) != NULL) {
+			zv->zv_dev = NULL;
+			destroy_dev(dev);
+			if (zv->zv_total_opens > 0) {
+				zv->zv_flags &= ~ZVOL_EXCL;
+				zv->zv_total_opens = 0;
+				zvol_last_close(zv);
+			}
+		}
+
+		make_dev_args_init(&args);
+		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+		args.mda_devsw = &zvol_cdevsw;
+		args.mda_cr = NULL;
+		args.mda_uid = UID_ROOT;
+		args.mda_gid = GID_OPERATOR;
+		args.mda_mode = 0640;
+		args.mda_si_drv2 = zv;
+		if (make_dev_s(&args, &zv->zv_dev,
+		    "%s/%s", ZVOL_DRIVER, newname) == 0)
+			zv->zv_dev->si_iosize_max = MAXPHYS;
+	}
 	strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
-	g_error_provider(pp, 0);
 }
 
 void
@@ -2388,23 +2991,21 @@
 	size_t oldnamelen, newnamelen;
 	zvol_state_t *zv;
 	char *namebuf;
+	boolean_t locked = B_FALSE;
 
 	oldnamelen = strlen(oldname);
 	newnamelen = strlen(newname);
 
 	DROP_GIANT();
-	mutex_enter(&spa_namespace_lock);
-	g_topology_lock();
+	/* See comment in zvol_open(). */
+	if (!MUTEX_HELD(&zfsdev_state_lock)) {
+		mutex_enter(&zfsdev_state_lock);
+		locked = B_TRUE;
+	}
 
-	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
-		pp = LIST_FIRST(&gp->provider);
-		if (pp == NULL)
-			continue;
-		zv = pp->private;
-		if (zv == NULL)
-			continue;
+	LIST_FOREACH(zv, &all_zvols, zv_links) {
 		if (strcmp(zv->zv_name, oldname) == 0) {
-			zvol_rename_minor(gp, newname);
+			zvol_rename_minor(zv, newname);
 		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
 		    (zv->zv_name[oldnamelen] == '/' ||
 		     zv->zv_name[oldnamelen] == '@')) {
@@ -2411,11 +3012,198 @@
 			snprintf(name, sizeof(name), "%s%c%s", newname,
 			    zv->zv_name[oldnamelen],
 			    zv->zv_name + oldnamelen + 1);
-			zvol_rename_minor(gp, name);
+			zvol_rename_minor(zv, name);
 		}
 	}
 
-	g_topology_unlock();
-	mutex_exit(&spa_namespace_lock);
+	if (locked)
+		mutex_exit(&zfsdev_state_lock);
 	PICKUP_GIANT();
 }
+
+static int
+zvol_d_open(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+	zvol_state_t *zv = dev->si_drv2;
+	int err = 0;
+
+	mutex_enter(&zfsdev_state_lock);
+	if (zv->zv_total_opens == 0)
+		err = zvol_first_open(zv);
+	if (err) {
+		mutex_exit(&zfsdev_state_lock);
+		return (err);
+	}
+	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+		err = SET_ERROR(EROFS);
+		goto out;
+	}
+	if (zv->zv_flags & ZVOL_EXCL) {
+		err = SET_ERROR(EBUSY);
+		goto out;
+	}
+#ifdef FEXCL
+	if (flags & FEXCL) {
+		if (zv->zv_total_opens != 0) {
+			err = SET_ERROR(EBUSY);
+			goto out;
+		}
+		zv->zv_flags |= ZVOL_EXCL;
+	}
+#endif
+
+	zv->zv_total_opens++;
+	if (flags & (FSYNC | FDSYNC)) {
+		zv->zv_sync_cnt++;
+		if (zv->zv_sync_cnt == 1)
+			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
+	}
+	mutex_exit(&zfsdev_state_lock);
+	return (err);
+out:
+	if (zv->zv_total_opens == 0)
+		zvol_last_close(zv);
+	mutex_exit(&zfsdev_state_lock);
+	return (err);
+}
+
+static int
+zvol_d_close(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+	zvol_state_t *zv = dev->si_drv2;
+
+	mutex_enter(&zfsdev_state_lock);
+	if (zv->zv_flags & ZVOL_EXCL) {
+		ASSERT(zv->zv_total_opens == 1);
+		zv->zv_flags &= ~ZVOL_EXCL;
+	}
+
+	/*
+	 * If the open count is zero, this is a spurious close.
+	 * That indicates a bug in the kernel / DDI framework.
+	 */
+	ASSERT(zv->zv_total_opens != 0);
+
+	/*
+	 * You may get multiple opens, but only one close.
+	 */
+	zv->zv_total_opens--;
+	if (flags & (FSYNC | FDSYNC))
+		zv->zv_sync_cnt--;
+
+	if (zv->zv_total_opens == 0)
+		zvol_last_close(zv);
+
+	mutex_exit(&zfsdev_state_lock);
+	return (0);
+}
+
+static int
+zvol_d_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
+{
+	zvol_state_t *zv;
+	rl_t *rl;
+	off_t offset, length;
+	int i, error;
+	boolean_t sync;
+
+	zv = dev->si_drv2;
+
+	error = 0;
+	KASSERT(zv->zv_total_opens > 0,
+	    ("Device with zero access count in zvol_d_ioctl"));
+
+	i = IOCPARM_LEN(cmd);
+	switch (cmd) {
+	case DIOCGSECTORSIZE:
+		*(u_int *)data = DEV_BSIZE;
+		break;
+	case DIOCGMEDIASIZE:
+		*(off_t *)data = zv->zv_volsize;
+		break;
+	case DIOCGFLUSH:
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+		break;
+	case DIOCGDELETE:
+		if (!zvol_unmap_enabled)
+			break;
+
+		offset = ((off_t *)data)[0];
+		length = ((off_t *)data)[1];
+		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
+		    offset < 0 || offset >= zv->zv_volsize ||
+		    length <= 0) {
+			printf("%s: offset=%jd length=%jd\n", __func__, offset,
+			    length);
+			error = EINVAL;
+			break;
+		}
+
+		rl = zfs_range_lock(&zv->zv_znode, offset, length, RL_WRITER);
+		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error != 0) {
+			sync = FALSE;
+			dmu_tx_abort(tx);
+		} else {
+			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+			zvol_log_truncate(zv, tx, offset, length, sync);
+			dmu_tx_commit(tx);
+			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+			    offset, length);
+		}
+		zfs_range_unlock(rl);
+		if (sync)
+			zil_commit(zv->zv_zilog, ZVOL_OBJ);
+		break;
+	case DIOCGSTRIPESIZE:
+		*(off_t *)data = zv->zv_volblocksize;
+		break;
+	case DIOCGSTRIPEOFFSET:
+		*(off_t *)data = 0;
+		break;
+	case DIOCGATTR: {
+		spa_t *spa = dmu_objset_spa(zv->zv_objset);
+		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
+		uint64_t refd, avail, usedobjs, availobjs;
+
+		if (strcmp(arg->name, "GEOM::candelete") == 0)
+			arg->value.i = 1;
+		else if (strcmp(arg->name, "blocksavail") == 0) {
+			dmu_objset_space(zv->zv_objset, &refd, &avail,
+			    &usedobjs, &availobjs);
+			arg->value.off = avail / DEV_BSIZE;
+		} else if (strcmp(arg->name, "blocksused") == 0) {
+			dmu_objset_space(zv->zv_objset, &refd, &avail,
+			    &usedobjs, &availobjs);
+			arg->value.off = refd / DEV_BSIZE;
+		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
+			avail = metaslab_class_get_space(spa_normal_class(spa));
+			avail -= metaslab_class_get_alloc(spa_normal_class(spa));
+			arg->value.off = avail / DEV_BSIZE;
+		} else if (strcmp(arg->name, "poolblocksused") == 0) {
+			refd = metaslab_class_get_alloc(spa_normal_class(spa));
+			arg->value.off = refd / DEV_BSIZE;
+		} else
+			error = ENOIOCTL;
+		break;
+	}
+	case FIOSEEKHOLE:
+	case FIOSEEKDATA: {
+		off_t *off = (off_t *)data;
+		uint64_t noff;
+		boolean_t hole;
+
+		hole = (cmd == FIOSEEKHOLE);
+		noff = *off;
+		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
+		*off = noff;
+		break;
+	}
+	default:
+		error = ENOIOCTL;
+	}
+
+	return (error);
+}
+#endif	/* illumos */
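
A hedged userland sketch of the FIOSEEKDATA/FIOSEEKHOLE support that
zvol_d_ioctl() now forwards to dmu_offset_next(): the ioctl definitions
come from FreeBSD's <sys/filio.h>, and the zvol path below is a
hypothetical example, not something from this change.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/filio.h>
#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	off_t off = 0;
	int fd = open("/dev/zvol/tank/vol0", O_RDONLY);

	if (fd == -1)
		err(1, "open");
	/* Find the first data region at or after offset 0. */
	if (ioctl(fd, FIOSEEKDATA, &off) == -1)
		err(1, "FIOSEEKDATA");
	printf("data at %jd\n", (intmax_t)off);
	/* Find where that region ends (the start of the next hole). */
	if (ioctl(fd, FIOSEEKHOLE, &off) == -1)
		err(1, "FIOSEEKHOLE");
	printf("hole at %jd\n", (intmax_t)off);
	return (0);
}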

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/os/callb.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/os/callb.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/os/callb.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -368,7 +369,7 @@
 	mutex_exit(&ct->ct_lock);
 }
 
-#ifdef sun
+#ifdef illumos
 /*
  * Return a boolean value indicating whether a particular kernel thread is
  * stopped in accordance with the cpr callback protocol.  If returning
@@ -432,7 +433,7 @@
 	mutex_exit(&ct->ct_lock);
 	return (ret_val);
 }
-#endif	/* sun */
+#endif	/* illumos */
 
 SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL);
 SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL);

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/os/fm.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/os/fm.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/os/fm.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -79,7 +80,7 @@
 static const char *fm_msgid = "SUNOS-8000-0G";
 static char *volatile fm_panicstr = NULL;
 
-#ifdef sun
+#ifdef illumos
 errorq_t *ereport_errorq;
 #endif
 void *ereport_dumpbuf;
@@ -112,7 +113,7 @@
 	{ "payload-set-failed", KSTAT_DATA_UINT64 }
 };
 
-#ifdef sun
+#ifdef illumos
 /*ARGSUSED*/
 static void
 fm_drain(void *private, void *data, errorq_elem_t *eep)
@@ -131,7 +132,7 @@
 {
 	kstat_t *ksp;
 
-#ifdef sun
+#ifdef illumos
 	(void) sysevent_evc_bind(FM_ERROR_CHAN,
 	    &ereport_chan, EVCH_CREAT | EVCH_HOLD_PEND);
 
@@ -145,7 +146,7 @@
 	if (ereport_size == 0)
 		ereport_size = ERPT_DATA_SZ;
 
-#ifdef sun
+#ifdef illumos
 	ereport_errorq = errorq_nvcreate("fm_ereport_queue",
 	    (errorq_func_t)fm_drain, NULL, ereport_qlen, ereport_size,
 	    FM_ERR_PIL, ERRORQ_VITAL);
@@ -170,7 +171,7 @@
 	}
 }
 
-#ifdef sun
+#ifdef illumos
 /*
  * Formatting utility function for fm_nvprintr.  We attempt to wrap chunks of
  * output so they aren't split across console lines, and return the end column.
@@ -379,7 +380,7 @@
 {
 	va_list ap;
 
-	(void) casptr((void *)&fm_panicstr, NULL, (void *)format);
+	(void) atomic_cas_ptr((void *)&fm_panicstr, NULL, (void *)format);
 #if defined(__i386) || defined(__amd64)
 	fastreboot_disable_highpil();
 #endif /* __i386 || __amd64 */
@@ -524,20 +525,20 @@
 
 	(void) nvlist_size(ereport, &nvl_size, NV_ENCODE_NATIVE);
 	if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) {
-		atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
 		return;
 	}
 
-#ifdef sun
+#ifdef illumos
 	if (sysevent_evc_bind(FM_ERROR_CHAN, &error_chan,
 	    EVCH_CREAT|EVCH_HOLD_PEND) != 0) {
-		atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
 		return;
 	}
 
 	if (sysevent_evc_publish(error_chan, EC_FM, ESC_FM_ERROR,
 	    SUNW_VENDOR, FM_PUB, ereport, evc_flag) != 0) {
-		atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
 		(void) sysevent_evc_unbind(error_chan);
 		return;
 	}
@@ -803,8 +804,7 @@
 	va_end(ap);
 
 	if (ret)
-		atomic_add_64(
-		    &erpt_kstat_data.payload_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64);
 }
 
 /*
@@ -837,7 +837,7 @@
 	int ret;
 
 	if (version != FM_EREPORT_VERS0) {
-		atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
 		return;
 	}
 
@@ -844,17 +844,17 @@
 	(void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s",
 	    FM_EREPORT_CLASS, erpt_class);
 	if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) {
-		atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
 		return;
 	}
 
 	if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) {
-		atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
 	}
 
 	if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR,
 	    (nvlist_t *)detector) != 0) {
-		atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
 	}
 
 	va_start(ap, detector);
@@ -863,7 +863,7 @@
 	va_end(ap);
 
 	if (ret)
-		atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
 }
 
 /*
@@ -886,19 +886,19 @@
 fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth)
 {
 	if (version != FM_HC_SCHEME_VERSION) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return (0);
 	}
 
 	if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 ||
 	    nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return (0);
 	}
 
 	if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
 	    (nvlist_t *)auth) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return (0);
 	}
 
@@ -930,14 +930,14 @@
 		pairs[i] = fm_nvlist_create(nva);
 		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
 		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 		}
 	}
 	va_end(ap);
 
 	if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0)
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 
 	for (i = 0; i < npairs; i++)
 		fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
@@ -944,8 +944,8 @@
 
 	if (snvl != NULL) {
 		if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 		}
 	}
 }
@@ -970,7 +970,7 @@
 	int err = 0;
 
 	if (version != DEV_SCHEME_VERSION0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
@@ -991,7 +991,7 @@
 		err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0);
 
 	if (err)
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 
 }
 
@@ -1016,35 +1016,35 @@
 	uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;
 
 	if (version < CPU_SCHEME_VERSION1) {
-		atomic_add_64(failedp, 1);
+		atomic_inc_64(failedp);
 		return;
 	}
 
 	if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) {
-		atomic_add_64(failedp, 1);
+		atomic_inc_64(failedp);
 		return;
 	}
 
 	if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME,
 	    FM_FMRI_SCHEME_CPU) != 0) {
-		atomic_add_64(failedp, 1);
+		atomic_inc_64(failedp);
 		return;
 	}
 
 	if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY,
 	    (nvlist_t *)auth) != 0)
-		atomic_add_64(failedp, 1);
+		atomic_inc_64(failedp);
 
 	if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0)
-		atomic_add_64(failedp, 1);
+		atomic_inc_64(failedp);
 
 	if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK,
 	    *cpu_maskp) != 0)
-		atomic_add_64(failedp, 1);
+		atomic_inc_64(failedp);
 
 	if (serial_idp == NULL || nvlist_add_string(fmri_cpu,
 	    FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0)
-			atomic_add_64(failedp, 1);
+			atomic_inc_64(failedp);
 }
 
 /*
@@ -1065,22 +1065,22 @@
     const char *unum, const char *serial, uint64_t offset)
 {
 	if (version != MEM_SCHEME_VERSION0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	if (!serial && (offset != (uint64_t)-1)) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
@@ -1087,27 +1087,25 @@
 	if (auth != NULL) {
 		if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
 		    (nvlist_t *)auth) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 		}
 	}
 
 	if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 	}
 
 	if (serial != NULL) {
 		if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID,
 		    (char **)&serial, 1) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 		}
-		if (offset != (uint64_t)-1) {
-			if (nvlist_add_uint64(fmri, FM_FMRI_MEM_OFFSET,
-			    offset) != 0) {
-				atomic_add_64(&erpt_kstat_data.
-				    fmri_set_failed.value.ui64, 1);
-			}
+		if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri,
+		    FM_FMRI_MEM_OFFSET, offset) != 0) {
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 		}
 	}
 }
@@ -1117,28 +1115,28 @@
     uint64_t vdev_guid)
 {
 	if (version != ZFS_SCHEME_VERSION0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 	}
 
 	if (vdev_guid != 0) {
 		if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 		}
 	}
 }
@@ -1265,7 +1263,7 @@
 	return (time);
 }
 
-#ifdef sun
+#ifdef illumos
 /*
  * Convert a getpcstack() trace to symbolic name+offset, and add the resulting
  * string array to a Fault Management ereport as FM_EREPORT_PAYLOAD_NAME_STACK.
@@ -1293,7 +1291,7 @@
 }
 #endif
 
-#ifdef sun
+#ifdef illumos
 void
 print_msg_hwerr(ctid_t ct_id, proc_t *p)
 {
@@ -1322,7 +1320,7 @@
 	 */
 	if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n)
 	    != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
@@ -1329,13 +1327,13 @@
 	for (i = 0; i < n; i++) {
 		if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
 		    &hcname) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 			return;
 		}
 		if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 			return;
 		}
 
@@ -1347,8 +1345,8 @@
 					fm_nvlist_destroy(pairs[j],
 					    FM_NVA_RETAIN);
 			}
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 			return;
 		}
 	}
@@ -1372,8 +1370,8 @@
 					fm_nvlist_destroy(pairs[j],
 					    FM_NVA_RETAIN);
 			}
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 			return;
 		}
 	}
@@ -1384,7 +1382,7 @@
 	 */
 	if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs,
 	    npairs + n) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
@@ -1394,8 +1392,8 @@
 
 	if (snvl != NULL) {
 		if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 			return;
 		}
 	}
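
The changes in this file are mechanical: every
atomic_add_64(&counter, 1) becomes atomic_inc_64(&counter).  A minimal
sketch of the two spellings, assuming the opensolaris-compat
<sys/atomic.h>; the counter name is illustrative:

#include <sys/types.h>
#include <sys/atomic.h>

static volatile uint64_t dropped;

void
count_drop_old(void)
{
	atomic_add_64(&dropped, 1);	/* old spelling */
}

void
count_drop_new(void)
{
	atomic_inc_64(&dropped);	/* new spelling, same effect */
}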

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/os/list.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/os/list.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/os/list.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -19,6 +20,8 @@
  * CDDL HEADER END
  */
 /*
+ * Copyright 2014 Garrett D'Amore <garrett at damore.org>
+ *
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
@@ -187,11 +190,12 @@
     ACE_DIRECTORY_INHERIT_ACE | \
     ACE_NO_PROPAGATE_INHERIT_ACE | \
     ACE_INHERIT_ONLY_ACE | \
+    ACE_INHERITED_ACE | \
     ACE_IDENTIFIER_GROUP)
 
 #define	ACE_TYPE_FLAGS		(ACE_OWNER|ACE_GROUP|ACE_EVERYONE| \
     ACE_IDENTIFIER_GROUP)
-#define	ACE_INHERIT_FLAGS	(ACE_FILE_INHERIT_ACE| \
+#define	ACE_INHERIT_FLAGS	(ACE_FILE_INHERIT_ACE|ACE_INHERITED_ACE| \
     ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE)
 
 /* cmd args to acl(2) for aclent_t  */
@@ -299,13 +303,8 @@
 
 #endif	/* !defined(_KERNEL) */
 
-#if defined(__STDC__)
 extern int acl(const char *path, int cmd, int cnt, void *buf);
 extern int facl(int fd, int cmd, int cnt, void *buf);
-#else	/* !__STDC__ */
-extern int acl();
-extern int facl();
-#endif	/* defined(__STDC__) */
 
 #ifdef	__cplusplus
 }

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/asm_linkage.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/asm_linkage.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/asm_linkage.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -23,6 +24,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
 #ifndef	_AVL_H
 #define	_AVL_H
 
@@ -39,7 +44,7 @@
 #include <sys/avl_impl.h>
 
 /*
- * This is a generic implemenatation of AVL trees for use in the Solaris kernel.
+ * This is a generic implementation of AVL trees for use in the Solaris kernel.
  * The interfaces provide an efficient way of implementing an ordered set of
  * data structures.
  *
@@ -175,7 +180,7 @@
  * Insert "new_data" in "tree" in the given "direction" either after
  * or before the data "here".
  *
- * This might be usefull for avl clients caching recently accessed
+ * This might be useful for avl clients caching recently accessed
  * data to avoid doing avl_find() again for insertion.
  *
  * new_data	- new data to insert
@@ -260,6 +265,11 @@
 extern boolean_t avl_update_gt(avl_tree_t *, void *);
 
 /*
+ * Swaps the contents of the two trees.
+ */
+extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2);
+
+/*
  * Return the number of nodes in the tree
  */
 extern ulong_t avl_numnodes(avl_tree_t *tree);
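
The new avl_swap() declaration exchanges the contents of two trees in
place.  A hedged usage sketch follows; the item type and function names
are illustrative, not from the source:

#include <stddef.h>
#include <sys/avl.h>

typedef struct item {
	int		i_key;
	avl_node_t	i_link;
} item_t;

static int
item_cmp(const void *a, const void *b)
{
	const item_t *ia = a, *ib = b;

	if (ia->i_key < ib->i_key)
		return (-1);
	return (ia->i_key > ib->i_key);
}

void
swap_example(void)
{
	avl_tree_t t1, t2;

	avl_create(&t1, item_cmp, sizeof (item_t),
	    offsetof(item_t, i_link));
	avl_create(&t2, item_cmp, sizeof (item_t),
	    offsetof(item_t, i_link));
	/* ... populate both trees with avl_add() ... */
	avl_swap(&t1, &t2);	/* t1 now holds t2's nodes, and vice versa */
}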

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -24,6 +25,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
 /*	  All Rights Reserved  	*/
 
@@ -31,8 +36,6 @@
 #ifndef _SYS_BITMAP_H
 #define	_SYS_BITMAP_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -152,6 +155,7 @@
  * Low order bit is 0, high order bit is 31.
  */
 extern int	highbit(ulong_t);
+extern int	highbit64(uint64_t);
 extern int	lowbit(ulong_t);
 extern int	bt_getlowbit(ulong_t *bitmap, size_t start, size_t stop);
 extern void	bt_copy(ulong_t *, ulong_t *, ulong_t);
@@ -168,9 +172,9 @@
  * to 0 otherwise.
  */
 #define	BT_ATOMIC_SET(bitmap, bitindex) \
-	{ atomic_or_long(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); }
+	{ atomic_or_ulong(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); }
 #define	BT_ATOMIC_CLEAR(bitmap, bitindex) \
-	{ atomic_and_long(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); }
+	{ atomic_and_ulong(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); }
 
 #define	BT_ATOMIC_SET_EXCL(bitmap, bitindex, result) \
 	{ result = atomic_set_long_excl(&(BT_WIM(bitmap, bitindex)),	\

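The new highbit64() prototype presumably mirrors highbit(): return the
1-based position of the highest set bit, or 0 when no bit is set.  A
sketch of that contract (an illustration of the semantics, not the
kernel's implementation):

#include <sys/types.h>

int
highbit64_sketch(uint64_t i)
{
	int h = 0;

	while (i != 0) {
		i >>= 1;
		h++;
	}
	return (h);	/* highbit64_sketch(1) == 1, highbit64_sketch(0) == 0 */
}
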
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cmn_err.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cmn_err.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cmn_err.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/compress.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/compress.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/compress.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -21,6 +22,7 @@
 
 /*
  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2014 Igor Kozhukhov <ikozhukhov at gmail.com>.
  */
 
 #ifndef _SYS_CPUVAR_H
@@ -31,6 +33,7 @@
 #include <sys/disp.h>
 #include <sys/processor.h>
 
+#include <sys/loadavg.h>
 #if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP)
 #include <sys/machcpuvar.h>
 #endif
@@ -52,16 +55,7 @@
 struct squeue_set_s;
 
 #define	CPU_CACHE_COHERENCE_SIZE	64
-#define	S_LOADAVG_SZ	11
-#define	S_MOVAVG_SZ	10
 
-struct loadavg_s {
-	int lg_cur;		/* current loadavg entry */
-	unsigned int lg_len;	/* number entries recorded */
-	hrtime_t lg_total;	/* used to temporarily hold load totals */
-	hrtime_t lg_loads[S_LOADAVG_SZ];	/* table of recorded entries */
-};
-
 /*
  * For fast event tracing.
  */
@@ -524,8 +518,8 @@
 	largest = (uint_t)(highbit(set) - 1);		\
 }
 
-#define	CPUSET_ATOMIC_DEL(set, cpu)	atomic_and_long(&(set), ~CPUSET(cpu))
-#define	CPUSET_ATOMIC_ADD(set, cpu)	atomic_or_long(&(set), CPUSET(cpu))
+#define	CPUSET_ATOMIC_DEL(set, cpu)	atomic_and_ulong(&(set), ~CPUSET(cpu))
+#define	CPUSET_ATOMIC_ADD(set, cpu)	atomic_or_ulong(&(set), CPUSET(cpu))
 
 #define	CPUSET_ATOMIC_XADD(set, cpu, result) \
 	{ result = atomic_set_long_excl(&(set), (cpu)); }
@@ -655,7 +649,7 @@
 
 void	mach_cpu_pause(volatile char *);
 
-void	pause_cpus(cpu_t *off_cp);
+void	pause_cpus(cpu_t *off_cp, void *(*func)(void *));
 void	start_cpus(void);
 int	cpus_paused(void);
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -27,7 +28,7 @@
 #ifndef	_CTF_H
 #define	_CTF_H
 
-#if defined(sun)
+#ifdef illumos
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 #endif
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf_api.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf_api.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf_api.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -23,6 +24,9 @@
  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
+ */
 
 /*
  * This header file defines the interfaces available from the CTF debugger
@@ -40,8 +44,6 @@
 #ifndef	_CTF_API_H
 #define	_CTF_API_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/elf.h>
@@ -68,7 +70,7 @@
 	const char *cts_name;	/* section name (if any) */
 	ulong_t cts_type;	/* section type (ELF SHT_... value) */
 	ulong_t cts_flags;	/* section flags (ELF SHF_... value) */
-#if defined(sun)
+#ifdef illumos
 	const void *cts_data;	/* pointer to section data */
 #else
 	void *cts_data;		/* pointer to section data */
@@ -154,6 +156,7 @@
 extern ctf_file_t *ctf_fdopen(int, int *);
 extern ctf_file_t *ctf_open(const char *, int *);
 extern ctf_file_t *ctf_create(int *);
+extern ctf_file_t *ctf_dup(ctf_file_t *);
 extern void ctf_close(ctf_file_t *);
 
 extern ctf_file_t *ctf_parent_file(ctf_file_t *);
@@ -179,6 +182,8 @@
 extern ctf_id_t ctf_type_resolve(ctf_file_t *, ctf_id_t);
 extern ssize_t ctf_type_lname(ctf_file_t *, ctf_id_t, char *, size_t);
 extern char *ctf_type_name(ctf_file_t *, ctf_id_t, char *, size_t);
+extern char *ctf_type_qname(ctf_file_t *, ctf_id_t, char *, size_t,
+    const char *);
 extern ssize_t ctf_type_size(ctf_file_t *, ctf_id_t);
 extern ssize_t ctf_type_align(ctf_file_t *, ctf_id_t);
 extern int ctf_type_kind(ctf_file_t *, ctf_id_t);
@@ -227,6 +232,8 @@
 
 extern int ctf_set_array(ctf_file_t *, ctf_id_t, const ctf_arinfo_t *);
 
+extern int ctf_delete_type(ctf_file_t *, ctf_id_t);
+
 extern int ctf_update(ctf_file_t *);
 extern int ctf_discard(ctf_file_t *);
 extern int ctf_write(ctf_file_t *, int);
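
Three entry points are new here: ctf_dup() (an independent handle on a type
container), ctf_type_qname() (ctf_type_name() with a qualifying prefix), and
ctf_delete_type(). A hedged sketch of the first two -- the kernel path and the
"kernel" prefix are illustrative only, and error handling is elided:

    int err;
    char buf[256];
    ctf_file_t *fp = ctf_open("/boot/kernel/kernel", &err);
    ctf_file_t *dup = ctf_dup(fp);	/* private copy of the container */
    ctf_id_t id = ctf_lookup_by_name(dup, "struct thread");

    if (id != CTF_ERR)
            (void) ctf_type_qname(dup, id, buf, sizeof (buf), "kernel");
    ctf_close(dup);
    ctf_close(fp);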

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -19,6 +20,8 @@
  * CDDL HEADER END
  */
 /*
+ * Copyright 2014 Garrett D'Amore <garrett at damore.org>
+ *
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
@@ -25,6 +28,7 @@
 
 /*
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -47,7 +51,6 @@
  * ASSERT and is evaluated on both debug and non-debug kernels.
  */
 
-#if defined(__STDC__)
 extern int assfail(const char *, const char *, int);
 #define	VERIFY(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__)))
 #ifdef DEBUG
@@ -55,15 +58,6 @@
 #else
 #define	ASSERT(x)  ((void)0)
 #endif
-#else	/* defined(__STDC__) */
-extern int assfail();
-#define	VERIFY(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__)))
-#ifdef DEBUG
-#define	ASSERT(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__)))
-#else
-#define	ASSERT(x)  ((void)0)
-#endif
-#endif	/* defined(__STDC__) */
 
 /*
  * Assertion variants sensitive to the compilation data model
@@ -131,6 +125,16 @@
 #define	ASSERT0(x)		((void)0)
 #endif
 
+/*
+ * Compile-time assertion. The condition 'x' must be constant.
+ */
+#ifndef CTASSERT
+#define	CTASSERT(x)		_CTASSERT(x, __LINE__)
+#define	_CTASSERT(x, y)		__CTASSERT(x, y)
+#define	__CTASSERT(x, y) \
+	typedef char __compile_time_assertion__ ## y [(x) ? 1 : -1]
+#endif
+
 #ifdef	_KERNEL
 
 extern void abort_sequence_enter(char *);
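
The new CTASSERT() turns a false condition into a compile-time error: it
declares a typedef of a char array whose size is 1 when the condition holds
and -1 (invalid) when it does not. The two-level expansion exists so that
__LINE__ is expanded before token pasting, giving each assertion a unique
typedef name. For example:

    CTASSERT(sizeof (uint64_t) == 8);	/* fine: expands to a char[1] typedef */
    /* CTASSERT(sizeof (uint64_t) == 4) would expand to char[-1] and fail. */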

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -25,14 +26,13 @@
  */
 
 /*
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_DTRACE_H
 #define	_SYS_DTRACE_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -53,9 +53,10 @@
 #include <sys/types.h>
 #include <sys/modctl.h>
 #include <sys/processor.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/systm.h>
 #else
+#include <sys/cpuvar.h>
 #include <sys/param.h>
 #include <sys/linker.h>
 #include <sys/ioccom.h>
@@ -63,8 +64,8 @@
 typedef int model_t;
 #endif
 #include <sys/ctf_api.h>
+#ifdef illumos
 #include <sys/cyclic.h>
-#if defined(sun)
 #include <sys/int_limits.h>
 #else
 #include <sys/stdint.h>
@@ -255,7 +256,7 @@
 #define	DIF_VAR_ERRNO		0x0120	/* thread errno */
 #define	DIF_VAR_EXECARGS	0x0121	/* process arguments */
 
-#if !defined(sun)
+#ifndef illumos
 #define	DIF_VAR_CPU		0x0200
 #endif
 
@@ -310,9 +311,12 @@
 #define	DIF_SUBR_SX_SHARED_HELD		48
 #define	DIF_SUBR_SX_EXCLUSIVE_HELD	49
 #define	DIF_SUBR_SX_ISEXCLUSIVE		50
+#define	DIF_SUBR_MEMSTR			51
+#define	DIF_SUBR_GETF			52
+#define	DIF_SUBR_JSON			53
+#define	DIF_SUBR_STRTOLL		54
+#define	DIF_SUBR_MAX			54	/* max subroutine value */
 
-#define	DIF_SUBR_MAX			50	/* max subroutine value */
-
 typedef uint32_t dif_instr_t;
 
 #define	DIF_INSTR_OP(i)			(((i) >> 24) & 0xff)
@@ -373,6 +377,7 @@
 #define	DIF_TYPE_STRING		1	/* type is a D string */
 
 #define	DIF_TF_BYREF		0x1	/* type is passed by reference */
+#define	DIF_TF_BYUREF		0x2	/* user type is passed by reference */
 
 /*
  * A DTrace Intermediate Format variable record is used to describe each of the
@@ -722,6 +727,20 @@
 
 #define	DOF_SECF_LOAD		1	/* section should be loaded */
 
+#define	DOF_SEC_ISLOADABLE(x)						\
+	(((x) == DOF_SECT_ECBDESC) || ((x) == DOF_SECT_PROBEDESC) ||	\
+	((x) == DOF_SECT_ACTDESC) || ((x) == DOF_SECT_DIFOHDR) ||	\
+	((x) == DOF_SECT_DIF) || ((x) == DOF_SECT_STRTAB) ||		\
+	((x) == DOF_SECT_VARTAB) || ((x) == DOF_SECT_RELTAB) ||		\
+	((x) == DOF_SECT_TYPTAB) || ((x) == DOF_SECT_URELHDR) ||	\
+	((x) == DOF_SECT_KRELHDR) || ((x) == DOF_SECT_OPTDESC) ||	\
+	((x) == DOF_SECT_PROVIDER) || ((x) == DOF_SECT_PROBES) ||	\
+	((x) == DOF_SECT_PRARGS) || ((x) == DOF_SECT_PROFFS) ||		\
+	((x) == DOF_SECT_INTTAB) || ((x) == DOF_SECT_XLTAB) ||		\
+	((x) == DOF_SECT_XLMEMBERS) || ((x) == DOF_SECT_XLIMPORT) ||	\
+	((x) == DOF_SECT_XLIMPORT) || ((x) == DOF_SECT_XLEXPORT) ||	\
+	((x) == DOF_SECT_PREXPORT) || ((x) == DOF_SECT_PRENOFFS))
+
 typedef struct dof_ecbdesc {
 	dof_secidx_t dofe_probes;	/* link to DOF_SECT_PROBEDESC */
 	dof_secidx_t dofe_pred;		/* link to DOF_SECT_DIFOHDR */
@@ -930,10 +949,10 @@
  * DTrace Metadata Description Structures
  *
  * DTrace separates the trace data stream from the metadata stream.  The only
- * metadata tokens placed in the data stream are enabled probe identifiers
- * (EPIDs) or (in the case of aggregations) aggregation identifiers.  In order
- * to determine the structure of the data, DTrace consumers pass the token to
- * the kernel, and receive in return a corresponding description of the enabled
+ * metadata tokens placed in the data stream are the dtrace_rechdr_t (EPID +
+ * timestamp) or (in the case of aggregations) aggregation identifiers.  To
+ * determine the structure of the data, DTrace consumers pass the token to the
+ * kernel, and receive in return a corresponding description of the enabled
  * probe (via the dtrace_eprobedesc structure) or the aggregation (via the
  * dtrace_aggdesc structure).  Both of these structures are expressed in terms
  * of record descriptions (via the dtrace_recdesc structure) that describe the
@@ -1028,7 +1047,12 @@
 #define	DTRACEOPT_AGGSORTREV	24	/* reverse-sort aggregations */
 #define	DTRACEOPT_AGGSORTPOS	25	/* agg. position to sort on */
 #define	DTRACEOPT_AGGSORTKEYPOS	26	/* agg. key position to sort on */
-#define	DTRACEOPT_MAX		27	/* number of options */
+#define	DTRACEOPT_TEMPORAL	27	/* temporally ordered output */
+#define	DTRACEOPT_AGGHIST	28	/* histogram aggregation output */
+#define	DTRACEOPT_AGGPACK	29	/* packed aggregation output */
+#define	DTRACEOPT_AGGZOOM	30	/* zoomed aggregation scaling */
+#define	DTRACEOPT_ZONE		31	/* zone in which to enable probes */
+#define	DTRACEOPT_MAX		32	/* number of options */
 
 #define	DTRACEOPT_UNSET		(dtrace_optval_t)-2	/* unset option */
 
@@ -1048,7 +1072,9 @@
  * where user-level wishes the kernel to snapshot the buffer to (the
  * dtbd_data field).  The kernel uses the same structure to pass back some
  * information regarding the buffer:  the size of data actually copied out, the
- * number of drops, the number of errors, and the offset of the oldest record.
+ * number of drops, the number of errors, the offset of the oldest record,
+ * and the time of the snapshot.
+ *
  * If the buffer policy is a "switch" policy, taking a snapshot of the
  * principal buffer has the additional effect of switching the active and
  * inactive buffers.  Taking a snapshot of the aggregation buffer _always_ has
@@ -1061,9 +1087,30 @@
 	uint64_t dtbd_drops;			/* number of drops */
 	DTRACE_PTR(char, dtbd_data);		/* data */
 	uint64_t dtbd_oldest;			/* offset of oldest record */
+	uint64_t dtbd_timestamp;		/* hrtime of snapshot */
 } dtrace_bufdesc_t;
 
 /*
+ * Each record in the buffer (dtbd_data) begins with a header that includes
+ * the epid and a timestamp.  The timestamp is split into two 4-byte parts
+ * so that we do not require 8-byte alignment.
+ */
+typedef struct dtrace_rechdr {
+	dtrace_epid_t dtrh_epid;		/* enabled probe id */
+	uint32_t dtrh_timestamp_hi;		/* high bits of hrtime_t */
+	uint32_t dtrh_timestamp_lo;		/* low bits of hrtime_t */
+} dtrace_rechdr_t;
+
+#define	DTRACE_RECORD_LOAD_TIMESTAMP(dtrh)			\
+	((dtrh)->dtrh_timestamp_lo +				\
+	((uint64_t)(dtrh)->dtrh_timestamp_hi << 32))
+
+#define	DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) {		\
+	(dtrh)->dtrh_timestamp_lo = (uint32_t)hrtime;		\
+	(dtrh)->dtrh_timestamp_hi = hrtime >> 32;		\
+}
+
+/*
  * DTrace Status
  *
  * The status of DTrace is relayed via the dtrace_status structure.  This
@@ -1229,7 +1276,7 @@
  * pseudodevice driver.  These ioctls comprise the user-kernel interface to
  * DTrace.
  */
-#if defined(sun)
+#ifdef illumos
 #define	DTRACEIOC		(('d' << 24) | ('t' << 16) | ('r' << 8))
 #define	DTRACEIOC_PROVIDER	(DTRACEIOC | 1)		/* provider query */
 #define	DTRACEIOC_PROBES	(DTRACEIOC | 2)		/* probe query */
@@ -1360,7 +1407,7 @@
  * helpers and should no longer be used.  No other ioctls are valid on the
  * helper minor node.
  */
-#if defined(sun)
+#ifdef illumos
 #define	DTRACEHIOC		(('d' << 24) | ('t' << 16) | ('h' << 8))
 #define	DTRACEHIOC_ADD		(DTRACEHIOC | 1)	/* add helper */
 #define	DTRACEHIOC_REMOVE	(DTRACEHIOC | 2)	/* remove helper */
@@ -1375,7 +1422,7 @@
 	char dofhp_mod[DTRACE_MODNAMELEN];	/* executable or library name */
 	uint64_t dofhp_addr;			/* base address of object */
 	uint64_t dofhp_dof;			/* address of helper DOF */
-#if !defined(sun)
+#ifndef illumos
 	int gen;
 #endif
 } dof_helper_t;
@@ -1680,7 +1727,22 @@
  *
  * 1.10.3  Return value
  *
- *   A boolean value.
+ *   A bitwise OR that encapsulates both the mode (either DTRACE_MODE_KERNEL
+ *   or DTRACE_MODE_USER) and the policy when the privilege of the enabling
+ *   is insufficient for that mode (a combination of DTRACE_MODE_NOPRIV_DROP,
+ *   DTRACE_MODE_NOPRIV_RESTRICT, and DTRACE_MODE_LIMITEDPRIV_RESTRICT).  If
+ *   DTRACE_MODE_NOPRIV_DROP bit is set, insufficient privilege will result
+ *   in the probe firing being silently ignored for the enabling; if the
+ *   DTRACE_MODE_NOPRIV_RESTRICT bit is set, insufficient privilege will not
+ *   prevent probe processing for the enabling, but restrictions will be in
+ *   place that induce a UPRIV fault upon attempt to examine probe arguments
+ *   or current process state.  If the DTRACE_MODE_LIMITEDPRIV_RESTRICT bit
+ *   is set, similar restrictions will be placed upon operation if the
+ *   privilege is sufficient to process the enabling, but does not otherwise
+ *   entitle the enabling to all zones.  The DTRACE_MODE_NOPRIV_DROP and
+ *   DTRACE_MODE_NOPRIV_RESTRICT are mutually exclusive (and one of these
+ *   two policies must be specified), but either may be combined (or not)
+ *   with DTRACE_MODE_LIMITEDPRIV_RESTRICT.
  *
  * 1.10.4  Caller's context
  *
@@ -2075,6 +2137,12 @@
 	void (*dtps_destroy)(void *arg, dtrace_id_t id, void *parg);
 } dtrace_pops_t;
 
+#define	DTRACE_MODE_KERNEL			0x01
+#define	DTRACE_MODE_USER			0x02
+#define	DTRACE_MODE_NOPRIV_DROP			0x10
+#define	DTRACE_MODE_NOPRIV_RESTRICT		0x20
+#define	DTRACE_MODE_LIMITEDPRIV_RESTRICT	0x40
+
 typedef uintptr_t	dtrace_provider_id_t;
 
 extern int dtrace_register(const char *, const dtrace_pattr_t *, uint32_t,
@@ -2256,7 +2324,7 @@
 	DTRACE_VTIME_ACTIVE_TNF		/* DTrace virtual time _and_ TNF */
 } dtrace_vtime_state_t;
 
-#if defined(sun)
+#ifdef illumos
 extern dtrace_vtime_state_t dtrace_vtime_active;
 #endif
 extern void dtrace_vtime_switch(kthread_t *next);
@@ -2268,7 +2336,7 @@
 struct regs;
 struct reg;
 
-#if defined(sun)
+#ifdef illumos
 extern int (*dtrace_pid_probe_ptr)(struct reg *);
 extern int (*dtrace_return_probe_ptr)(struct reg *);
 extern void (*dtrace_fasttrap_fork_ptr)(proc_t *, proc_t *);
@@ -2287,18 +2355,21 @@
 extern void dtrace_membar_consumer(void);
 
 extern void (*dtrace_cpu_init)(processorid_t);
+#ifdef illumos
 extern void (*dtrace_modload)(modctl_t *);
 extern void (*dtrace_modunload)(modctl_t *);
+#endif
 extern void (*dtrace_helpers_cleanup)(void);
 extern void (*dtrace_helpers_fork)(proc_t *parent, proc_t *child);
 extern void (*dtrace_cpustart_init)(void);
 extern void (*dtrace_cpustart_fini)(void);
+extern void (*dtrace_closef)(void);
 
 extern void (*dtrace_debugger_init)(void);
 extern void (*dtrace_debugger_fini)(void);
 extern dtrace_cacheid_t dtrace_predcache_id;
 
-#if defined(sun)
+#ifdef illumos
 extern hrtime_t dtrace_gethrtime(void);
 #else
 void dtrace_debug_printf(const char *, ...) __printflike(1, 2);
@@ -2317,10 +2388,10 @@
 #if defined(__i386) || defined(__amd64)
 extern int dtrace_instr_size(uchar_t *instr);
 extern int dtrace_instr_size_isa(uchar_t *, model_t, int *);
+extern void dtrace_invop_callsite(void);
+#endif
 extern void dtrace_invop_add(int (*)(uintptr_t, uintptr_t *, uintptr_t));
 extern void dtrace_invop_remove(int (*)(uintptr_t, uintptr_t *, uintptr_t));
-extern void dtrace_invop_callsite(void);
-#endif
 
 #ifdef __sparc
 extern int dtrace_blksuword32(uintptr_t, uint32_t *, int);
@@ -2327,7 +2398,7 @@
 extern void dtrace_getfsr(uint64_t *);
 #endif
 
-#if !defined(sun)
+#ifndef illumos
 extern void dtrace_helpers_duplicate(proc_t *, proc_t *);
 extern void dtrace_helpers_destroy(proc_t *);
 #endif
@@ -2353,6 +2424,15 @@
 #define	DTRACE_INVOP_NOP		4
 #define	DTRACE_INVOP_RET		5
 
+#elif defined(__powerpc__)
+
+#define DTRACE_INVOP_RET	1
+#define DTRACE_INVOP_BCTR	2
+#define DTRACE_INVOP_BLR	3
+#define DTRACE_INVOP_JUMP	4
+#define DTRACE_INVOP_MFLR_R0	5
+#define DTRACE_INVOP_NOP	6
+
 #endif
 
 #ifdef	__cplusplus
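
The new dtrace_rechdr_t splits the 64-bit hrtime_t timestamp into two 32-bit
halves so records need only 4-byte alignment; the LOAD/STORE macros reassemble
it. A minimal round-trip sketch (the timestamp value is arbitrary):

    dtrace_rechdr_t hdr;
    hrtime_t now = 0x1122334455667788LL;	/* arbitrary example value */

    DTRACE_RECORD_STORE_TIMESTAMP(&hdr, now);
    ASSERT(DTRACE_RECORD_LOAD_TIMESTAMP(&hdr) == now);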

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -18,11 +19,12 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: release/9.2.0/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h 250484 2013-05-10 21:12:55Z pfg $
+ * $FreeBSD: stable/10/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h 313486 2017-02-09 22:04:56Z ngie $
  */
 
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -49,7 +51,7 @@
  */
 
 #include <sys/dtrace.h>
-#if !defined(sun)
+#ifndef illumos
 #ifdef __sparcv9
 typedef uint32_t		pc_t;
 #else
@@ -209,15 +211,18 @@
  * predicate is non-NULL, the DIF object is executed.  If the result is
  * non-zero, the action list is processed, with each action being executed
  * accordingly.  When the action list has been completely executed, processing
- * advances to the next ECB.  processing advances to the next ECB.  If the
- * result is non-zero; For each ECB, it first determines the The ECB
- * abstraction allows disjoint consumers to multiplex on single probes.
+ * advances to the next ECB. The ECB abstraction allows disjoint consumers
+ * to multiplex on single probes.
+ *
+ * Execution of the ECB results in consuming dte_size bytes in the buffer
+ * to record data.  During execution, dte_needed bytes must be available in
+ * the buffer.  This space is used for both recorded data and tuple data.
  */
 struct dtrace_ecb {
 	dtrace_epid_t dte_epid;			/* enabled probe ID */
 	uint32_t dte_alignment;			/* required alignment */
-	size_t dte_needed;			/* bytes needed */
-	size_t dte_size;			/* total size of payload */
+	size_t dte_needed;			/* space needed for execution */
+	size_t dte_size;			/* size of recorded payload */
 	dtrace_predicate_t *dte_predicate;	/* predicate, if any */
 	dtrace_action_t *dte_action;		/* actions, if any */
 	dtrace_ecb_t *dte_next;			/* next ECB on probe */
@@ -275,27 +280,30 @@
  * the EPID, the consumer can determine the data layout.  (The data buffer
  * layout is shown schematically below.)  By assuring that one can determine
  * data layout from the EPID, the metadata stream can be separated from the
- * data stream -- simplifying the data stream enormously.
+ * data stream -- simplifying the data stream enormously.  The ECB always
+ * precedes the recorded data as part of the dtrace_rechdr_t structure that
+ * includes the EPID and a high-resolution timestamp used for output ordering
+ * consistency.
  *
- *      base of data buffer --->  +------+--------------------+------+
- *                                | EPID | data               | EPID |
- *                                +------+--------+------+----+------+
- *                                | data          | EPID | data      |
- *                                +---------------+------+-----------+
- *                                | data, cont.                      |
- *                                +------+--------------------+------+
- *                                | EPID | data               |      |
- *                                +------+--------------------+      |
- *                                |                ||                |
- *                                |                ||                |
- *                                |                \/                |
- *                                :                                  :
- *                                .                                  .
- *                                .                                  .
- *                                .                                  .
- *                                :                                  :
- *                                |                                  |
- *     limit of data buffer --->  +----------------------------------+
+ *      base of data buffer --->  +--------+--------------------+--------+
+ *                                | rechdr | data               | rechdr |
+ *                                +--------+------+--------+----+--------+
+ *                                | data          | rechdr | data        |
+ *                                +---------------+--------+-------------+
+ *                                | data, cont.                          |
+ *                                +--------+--------------------+--------+
+ *                                | rechdr | data               |        |
+ *                                +--------+--------------------+        |
+ *                                |                ||                    |
+ *                                |                ||                    |
+ *                                |                \/                    |
+ *                                :                                      :
+ *                                .                                      .
+ *                                .                                      .
+ *                                .                                      .
+ *                                :                                      :
+ *                                |                                      |
+ *     limit of data buffer --->  +--------------------------------------+
  *
  * When evaluating an ECB, dtrace_probe() determines if the ECB's needs of the
  * principal buffer (both scratch and payload) exceed the available space.  If
@@ -927,6 +935,7 @@
 	uintptr_t dtms_strtok;			/* saved strtok() pointer */
 	uint32_t dtms_access;			/* memory access rights */
 	dtrace_difo_t *dtms_difo;		/* current dif object */
+	file_t *dtms_getf;			/* cached rval of getf() */
 } dtrace_mstate_t;
 
 #define	DTRACE_COND_OWNER	0x1
@@ -1114,7 +1123,7 @@
  * dtrace_state structure.
  */
 struct dtrace_state {
-#if defined(sun)
+#ifdef illumos
 	dev_t dts_dev;				/* device */
 #else
 	struct cdev *dts_dev;			/* device */
@@ -1132,7 +1141,7 @@
 	int dts_nspeculations;			/* number of speculations */
 	int dts_naggregations;			/* number of aggregations */
 	dtrace_aggregation_t **dts_aggregations; /* aggregation array */
-#if defined(sun)
+#ifdef illumos
 	vmem_t *dts_aggid_arena;		/* arena for aggregation IDs */
 #else
 	struct unrhdr *dts_aggid_arena;		/* arena for aggregation IDs */
@@ -1144,7 +1153,7 @@
 	uint32_t dts_dblerrors;			/* errors in ERROR probes */
 	uint32_t dts_reserve;			/* space reserved for END */
 	hrtime_t dts_laststatus;		/* time of last status */
-#if defined(sun)
+#ifdef illumos
 	cyclic_id_t dts_cleaner;		/* cleaning cyclic */
 	cyclic_id_t dts_deadman;		/* deadman cyclic */
 #else
@@ -1159,6 +1168,7 @@
 	dtrace_optval_t dts_options[DTRACEOPT_MAX]; /* options */
 	dtrace_cred_t dts_cred;			/* credentials */
 	size_t dts_nretained;			/* number of retained enabs */
+	int dts_getf;				/* number of getf() calls */
 };
 
 struct dtrace_provider {
@@ -1261,7 +1271,11 @@
 	uintptr_t	dtt_limit;		/* limit of toxic range */
 } dtrace_toxrange_t;
 
+#ifdef illumos
 extern uint64_t dtrace_getarg(int, int);
+#else
+extern uint64_t __noinline dtrace_getarg(int, int);
+#endif
 extern greg_t dtrace_getfp(void);
 extern int dtrace_getipl(void);
 extern uintptr_t dtrace_caller(int);
@@ -1287,7 +1301,7 @@
     int, uintptr_t);
 extern int dtrace_assfail(const char *, const char *, int);
 extern int dtrace_attached(void);
-#if defined(sun)
+#ifdef illumos
 extern hrtime_t dtrace_gethrestime(void);
 #endif
 
@@ -1304,16 +1318,19 @@
 /*
  * DTrace Assertions
  *
- * DTrace calls ASSERT from probe context.  To assure that a failed ASSERT
- * does not induce a markedly more catastrophic failure (e.g., one from which
- * a dump cannot be gleaned), DTrace must define its own ASSERT to be one that
- * may safely be called from probe context.  This header file must thus be
- * included by any DTrace component that calls ASSERT from probe context, and
- * _only_ by those components.  (The only exception to this is kernel
- * debugging infrastructure at user-level that doesn't depend on calling
- * ASSERT.)
+ * DTrace calls ASSERT and VERIFY from probe context.  To assure that a failed
+ * ASSERT or VERIFY does not induce a markedly more catastrophic failure (e.g.,
+ * one from which a dump cannot be gleaned), DTrace must define its own ASSERT
+ * and VERIFY macros to be ones that may safely be called from probe context.
+ * This header file must thus be included by any DTrace component that calls
+ * ASSERT and/or VERIFY from probe context, and _only_ by those components.
+ * (The only exception to this is kernel debugging infrastructure at user-level
+ * that doesn't depend on calling ASSERT.)
  */
 #undef ASSERT
+#undef VERIFY
+#define	VERIFY(EX)	((void)((EX) || \
+			dtrace_assfail(#EX, __FILE__, __LINE__)))
 #ifdef DEBUG
 #define	ASSERT(EX)	((void)((EX) || \
 			dtrace_assfail(#EX, __FILE__, __LINE__)))
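
With the record-header change, each record in a snapshotted principal buffer
now begins with a dtrace_rechdr_t rather than a bare EPID. A hedged sketch of
a consumer walk; record_size() is a hypothetical stand-in for looking up the
record length via the dtrace_eprobedesc for that EPID:

    uint64_t off = 0;

    while (off < buf->dtbd_size) {
            dtrace_rechdr_t *dtrh =
                (dtrace_rechdr_t *)(buf->dtbd_data + off);
            hrtime_t ts = DTRACE_RECORD_LOAD_TIMESTAMP(dtrh);

            /* ... consume the record for dtrh->dtrh_epid at time ts ... */
            off += record_size(dtrh->dtrh_epid);	/* hypothetical */
    }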

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/errorq.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/errorq.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/errorq.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -37,13 +38,13 @@
 extern "C" {
 #endif
 
-#if defined(sun)
+#ifdef illumos
 #define	FASTTRAPIOC		(('m' << 24) | ('r' << 16) | ('f' << 8))
 #define	FASTTRAPIOC_MAKEPROBE	(FASTTRAPIOC | 1)
 #define	FASTTRAPIOC_GETINSTR	(FASTTRAPIOC | 2)
 #else
-#define	FASTTRAPIOC_MAKEPROBE	_IOW('f', 1, fasttrap_probe_spec_t)
 #define	FASTTRAPIOC_GETINSTR	_IOWR('f', 2, uint8_t)
+#define	FASTTRAPIOC_MAKEPROBE	_IO('f', 3)
 #endif
 
 typedef enum fasttrap_probe_type {
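
On FreeBSD, FASTTRAPIOC_MAKEPROBE becomes a bare _IO command: the probe spec
ends in a variable-length offsets array (assuming the trailing ftps_offs[]
member of fasttrap_probe_spec_t), so the generic ioctl copyin of a fixed-size
structure was never adequate, and the driver now copies the spec in from the
user pointer itself. A hedged userland sketch (ntps is hypothetical, headers
and error handling elided):

    fasttrap_probe_spec_t *probe;
    size_t sz = sizeof (*probe) + (ntps - 1) * sizeof (probe->ftps_offs[0]);

    probe = calloc(1, sz);
    /* ... fill in pid, module, function and the ntps offsets ... */
    if (ioctl(fd, FASTTRAPIOC_MAKEPROBE, probe) != 0)
            err(1, "FASTTRAPIOC_MAKEPROBE");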

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -32,6 +33,7 @@
 #include <sys/types.h>
 #include <sys/dtrace.h>
 #include <sys/proc.h>
+#include <sys/queue.h>
 #include <sys/fasttrap.h>
 #include <sys/fasttrap_isa.h>
 
@@ -68,8 +70,30 @@
  * then disabled, ownership of that tracepoint may be exchanged for an
  * unused tracepoint belonging to another probe that was attached to the
  * enabled tracepoint.
+ *
+ * On FreeBSD, fasttrap providers also maintain per-thread scratch space for use
+ * by the ISA-specific fasttrap code. The fasttrap_scrblock_t type stores the
+ * virtual address of a page-sized memory block that is mapped into a process'
+ * address space. Each block is carved up into chunks (fasttrap_scrspace_t) for
+ * use by individual threads, which keep the address of their scratch space
+ * chunk in their struct kdtrace_thread. A thread's scratch space isn't released
+ * until it exits.
  */
 
+#ifndef illumos
+typedef struct fasttrap_scrblock {
+	vm_offset_t ftsb_addr;			/* address of a scratch block */
+	LIST_ENTRY(fasttrap_scrblock) ftsb_next;/* next block in list */
+} fasttrap_scrblock_t;
+#define	FASTTRAP_SCRBLOCK_SIZE	PAGE_SIZE
+
+typedef struct fasttrap_scrspace {
+	uintptr_t ftss_addr;			/* scratch space address */
+	LIST_ENTRY(fasttrap_scrspace) ftss_next;/* next in list */
+} fasttrap_scrspace_t;
+#define	FASTTRAP_SCRSPACE_SIZE	64
+#endif
+
 typedef struct fasttrap_proc {
 	pid_t ftpc_pid;				/* process ID for this proc */
 	uint64_t ftpc_acount;			/* count of active providers */
@@ -76,6 +100,11 @@
 	uint64_t ftpc_rcount;			/* count of extant providers */
 	kmutex_t ftpc_mtx;			/* lock on all but acount */
 	struct fasttrap_proc *ftpc_next;	/* next proc in hash chain */
+#ifndef illumos
+	LIST_HEAD(, fasttrap_scrblock) ftpc_scrblks; /* mapped scratch blocks */
+	LIST_HEAD(, fasttrap_scrspace) ftpc_fscr; /* free scratch space */
+	LIST_HEAD(, fasttrap_scrspace) ftpc_ascr; /* used scratch space */
+#endif
 } fasttrap_proc_t;
 
 typedef struct fasttrap_provider {
@@ -158,22 +187,30 @@
  */
 #define	fasttrap_copyout	copyout
 #define	fasttrap_fuword32	fuword32
-#define	fasttrap_suword32(_k, _u)	copyout((_k), (_u), sizeof(uint32_t))
-#define	fasttrap_suword64(_k, _u)	copyout((_k), (_u), sizeof(uint64_t))
+#define	fasttrap_suword32	suword32
+#define	fasttrap_suword64	suword64
 
 #ifdef __amd64__
 #define	fasttrap_fulword	fuword64
-#define	fasttrap_sulword	fasttrap_suword64
+#define	fasttrap_sulword	suword64
 #else
 #define	fasttrap_fulword	fuword32
-#define	fasttrap_sulword	fasttrap_suword32
+#define	fasttrap_sulword	suword32
 #endif
 
 extern void fasttrap_sigtrap(proc_t *, kthread_t *, uintptr_t);
+#ifndef illumos
+extern fasttrap_scrspace_t *fasttrap_scraddr(struct thread *,
+    fasttrap_proc_t *);
+#endif
 
 extern dtrace_id_t 		fasttrap_probe_id;
 extern fasttrap_hash_t		fasttrap_tpoints;
 
+#ifndef illumos
+extern struct rmlock		fasttrap_tp_lock;
+#endif
+
 #define	FASTTRAP_TPOINTS_INDEX(pid, pc) \
 	(((pc) / sizeof (fasttrap_instr_t) + (pid)) & fasttrap_tpoints.fth_mask)
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/feature_tests.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/feature_tests.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/feature_tests.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,6 +21,8 @@
  */
 
 /*
+ * Copyright 2013 Garrett D'Amore <garrett at damore.org>
+ *
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
@@ -42,6 +45,7 @@
  *		199309L	    POSIX.1b-1993 compilation (Real Time)
  *		199506L	    POSIX.1c-1995 compilation (POSIX Threads)
  *		200112L	    POSIX.1-2001 compilation (Austin Group Revision)
+ *		200809L     POSIX.1-2008 compilation
  */
 #if defined(_POSIX_SOURCE) && !defined(_POSIX_C_SOURCE)
 #define	_POSIX_C_SOURCE 1
@@ -48,9 +52,9 @@
 #endif
 
 /*
- * The feature test macros __XOPEN_OR_POSIX, _STRICT_STDC, and _STDC_C99
- * are Sun implementation specific macros created in order to compress
- * common standards specified feature test macros for easier reading.
+ * The feature test macros __XOPEN_OR_POSIX, _STRICT_STDC, _STRICT_SYMBOLS,
+ * and _STDC_C99 are Sun implementation specific macros created in order to
+ * compress common standards specified feature test macros for easier reading.
  * These macros should not be used by the application developer as
  * unexpected results may occur. Instead, the user should reference
  * standards(5) for correct usage of the standards feature test macros.
@@ -76,6 +80,10 @@
  *                      the C standard. A value of 199901L indicates a
  *                      compiler that complies with ISO/IEC 9899:1999, other-
  *                      wise known as the C99 standard.
+ *
+ * _STRICT_SYMBOLS	Used in cases where symbol visibility is restricted
+ *                      by the standards, and the user has not explicitly
+ *                      relaxed the strictness via __EXTENSIONS__.
  */
 
 #if defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE)
@@ -145,6 +153,14 @@
 #endif
 
 /*
+ * Use strict symbol visibility.
+ */
+#if (defined(_STRICT_STDC) || defined(__XOPEN_OR_POSIX)) && \
+	!defined(__EXTENSIONS__)
+#define	_STRICT_SYMBOLS
+#endif
+
+/*
  * Large file interfaces:
  *
  *	_LARGEFILE_SOURCE
@@ -223,6 +239,8 @@
  * X/Open CAE Specification, Issue 5 (XPG5)
  * Open Group Technical Standard, Issue 6 (XPG6), also referred to as
  *    IEEE Std. 1003.1-2001 and ISO/IEC 9945:2002.
+ * Open Group Technical Standard, Issue 7 (XPG7), also referred to as
+ *    IEEE Std. 1003.1-2008 and ISO/IEC 9945:2009.
  *
  * XPG4v2 is also referred to as UNIX 95 (SUS or SUSv1).
  * XPG5 is also referred to as UNIX 98 or the Single Unix Specification,
@@ -230,6 +248,7 @@
  * XPG6 is the result of a merge of the X/Open and POSIX specifications
  *     and as such is also referred to as IEEE Std. 1003.1-2001 in
  *     addition to UNIX 03 and SUSv3.
+ * XPG7 is also referred to as UNIX 08 and SUSv4.
  *
  * When writing a conforming X/Open application, as per the specification
  * requirements, the appropriate feature test macros must be defined at
@@ -242,6 +261,7 @@
  * _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED = 1           XPG4v2
  * _XOPEN_SOURCE = 500                                   XPG5
  * _XOPEN_SOURCE = 600  (or POSIX_C_SOURCE=200112L)      XPG6
+ * _XOPEN_SOURCE = 700  (or POSIX_C_SOURCE=200809L)      XPG7
  *
  * In order to simplify the guards within the headers, the following
  * implementation private test macros have been created. Applications
@@ -261,6 +281,7 @@
  * _XPG4_2  X/Open CAE Specification, Issue 4, Version 2 (XPG4v2/UNIX 95/SUS)
  * _XPG5    X/Open CAE Specification, Issue 5 (XPG5/UNIX 98/SUSv2)
  * _XPG6    Open Group Technical Standard, Issue 6 (XPG6/UNIX 03/SUSv3)
+ * _XPG7    Open Group Technical Standard, Issue 7 (XPG7/UNIX 08/SUSv4)
  */
 
 /* X/Open Portability Guide, Issue 3 */
@@ -295,6 +316,19 @@
 #define	_POSIX_C_SOURCE			200112L
 #undef	_XOPEN_SOURCE
 #define	_XOPEN_SOURCE			600
+
+/* Open Group Technical Standard, Issue 7 */
+#elif	(_XOPEN_SOURCE - 0 == 700) || (_POSIX_C_SOURCE - 0 == 200809L)
+#define	_XPG7
+#define	_XPG6
+#define	_XPG5
+#define	_XPG4_2
+#define	_XPG4
+#define	_XPG3
+#undef	_POSIX_C_SOURCE
+#define	_POSIX_C_SOURCE			200809L
+#undef	_XOPEN_SOURCE
+#define	_XOPEN_SOURCE			700
 #endif
 
 /*
@@ -305,12 +339,15 @@
  * with the value of 4 indicates an XPG4 or XPG4v2 (UNIX 95) application.
  * _XOPEN_VERSION  defined with a value of 500 indicates an XPG5 (UNIX 98)
  * application and with a value of 600 indicates an XPG6 (UNIX 03)
- * application.  The appropriate version is determined by the use of the
+ * application and with a value of 700 indicates an XPG7 (UNIX 08).
+ * The appropriate version is determined by the use of the
  * feature test macros described earlier.  The value of _XOPEN_VERSION
  * defaults to 3 otherwise indicating support for XPG3 applications.
  */
 #ifndef _XOPEN_VERSION
-#ifdef	_XPG6
+#if	defined(_XPG7)
+#define	_XOPEN_VERSION 700
+#elif	defined(_XPG6)
 #define	_XOPEN_VERSION 600
 #elif defined(_XPG5)
 #define	_XOPEN_VERSION 500
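
With _XPG7 defined, headers can expose POSIX.1-2008 interfaces only to
applications that asked for them (compile with -D_XOPEN_SOURCE=700 or
-D_POSIX_C_SOURCE=200809L). One plausible shape for such a guard -- not a
quote of any particular header, and stpcpy() is just an example of a symbol
new in SUSv4:

    #include <sys/feature_tests.h>

    #if defined(_XPG7) || defined(__EXTENSIONS__)
    extern char *stpcpy(char *, const char *);	/* new in POSIX.1-2008 */
    #endif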

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -21,10 +22,11 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2012, Martin Matuska <mm at FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -49,12 +51,17 @@
  * combined into masks that can be passed to various functions.
  */
 typedef enum {
-	ZFS_TYPE_FILESYSTEM	= 0x1,
-	ZFS_TYPE_SNAPSHOT	= 0x2,
-	ZFS_TYPE_VOLUME		= 0x4,
-	ZFS_TYPE_POOL		= 0x8
+	ZFS_TYPE_FILESYSTEM	= (1 << 0),
+	ZFS_TYPE_SNAPSHOT	= (1 << 1),
+	ZFS_TYPE_VOLUME		= (1 << 2),
+	ZFS_TYPE_POOL		= (1 << 3),
+	ZFS_TYPE_BOOKMARK	= (1 << 4)
 } zfs_type_t;
 
+/*
+ * NB: lzc_dataset_type should be updated whenever a new objset type is added,
+ * if it represents a real type of a dataset that can be created from userland.
+ */
 typedef enum dmu_objset_type {
 	DMU_OST_NONE,
 	DMU_OST_META,
@@ -68,9 +75,13 @@
 #define	ZFS_TYPE_DATASET	\
 	(ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
 
+/*
+ * All of these include the terminating NUL byte.
+ */
 #define	ZAP_MAXNAMELEN 256
 #define	ZAP_MAXVALUELEN (1024 * 8)
 #define	ZAP_OLDMAXVALUELEN 1024
+#define	ZFS_MAX_DATASET_NAME_LEN 256
 
 /*
  * Dataset properties are identified by these constants and must be added to
@@ -143,6 +154,15 @@
 	ZFS_PROP_CLONES,
 	ZFS_PROP_LOGICALUSED,
 	ZFS_PROP_LOGICALREFERENCED,
+	ZFS_PROP_INCONSISTENT,		/* not exposed to the user */
+	ZFS_PROP_VOLMODE,
+	ZFS_PROP_FILESYSTEM_LIMIT,
+	ZFS_PROP_SNAPSHOT_LIMIT,
+	ZFS_PROP_FILESYSTEM_COUNT,
+	ZFS_PROP_SNAPSHOT_COUNT,
+	ZFS_PROP_REDUNDANT_METADATA,
+	ZFS_PROP_PREV_SNAP,
+	ZFS_PROP_RECEIVE_RESUME_TOKEN,
 	ZFS_NUM_PROPS
 } zfs_prop_t;
 
@@ -185,6 +205,10 @@
 	ZPOOL_PROP_COMMENT,
 	ZPOOL_PROP_EXPANDSZ,
 	ZPOOL_PROP_FREEING,
+	ZPOOL_PROP_FRAGMENTATION,
+	ZPOOL_PROP_LEAKED,
+	ZPOOL_PROP_MAXBLOCKSIZE,
+	ZPOOL_PROP_TNAME,
 	ZPOOL_NUM_PROPS
 } zpool_prop_t;
 
@@ -335,7 +359,18 @@
 	ZFS_SYNC_DISABLED = 2
 } zfs_sync_type_t;
 
+typedef enum {
+	ZFS_VOLMODE_DEFAULT = 0,
+	ZFS_VOLMODE_GEOM = 1,
+	ZFS_VOLMODE_DEV = 2,
+	ZFS_VOLMODE_NONE = 3
+} zfs_volmode_t;
 
+typedef enum {
+	ZFS_REDUNDANT_METADATA_ALL,
+	ZFS_REDUNDANT_METADATA_MOST
+} zfs_redundant_metadata_type_t;
+
 /*
  * On-disk version number.
  */
@@ -523,7 +558,7 @@
 #define	ZPOOL_CONFIG_SPLIT_GUID		"split_guid"
 #define	ZPOOL_CONFIG_SPLIT_LIST		"guid_list"
 #define	ZPOOL_CONFIG_REMOVING		"removing"
-#define	ZPOOL_CONFIG_RESILVERING	"resilvering"
+#define	ZPOOL_CONFIG_RESILVER_TXG	"resilver_txg"
 #define	ZPOOL_CONFIG_COMMENT		"comment"
 #define	ZPOOL_CONFIG_SUSPENDED		"suspended"	/* not stored on disk */
 #define	ZPOOL_CONFIG_TIMESTAMP		"timestamp"	/* not stored on disk */
@@ -574,10 +609,19 @@
 
 /*
  * This is needed in userland to report the minimum necessary device size.
+ *
+ * Note that the zfs test suite uses 64MB vdevs.
  */
 #define	SPA_MINDEVSIZE		(64ULL << 20)
 
 /*
+ * Set if the fragmentation has not yet been calculated. This can happen
+ * because the space maps have not been upgraded or the histogram feature
+ * is not enabled.
+ */
+#define	ZFS_FRAG_INVALID	UINT64_MAX
+
+/*
  * The location of the pool configuration repository, shared between kernel and
  * userland.
  */
@@ -620,7 +664,8 @@
 	VDEV_AUX_IO_FAILURE,	/* experienced I/O failure		*/
 	VDEV_AUX_BAD_LOG,	/* cannot read log chain(s)		*/
 	VDEV_AUX_EXTERNAL,	/* external diagnosis			*/
-	VDEV_AUX_SPLIT_POOL	/* vdev was split off into another pool	*/
+	VDEV_AUX_SPLIT_POOL,	/* vdev was split off into another pool	*/
+	VDEV_AUX_ASHIFT_TOO_BIG /* vdev's min block size is too large   */
 } vdev_aux_t;
 
 /*
@@ -714,7 +759,14 @@
 	uint64_t	vs_self_healed;		/* self-healed bytes	*/
 	uint64_t	vs_scan_removing;	/* removing?	*/
 	uint64_t	vs_scan_processed;	/* scan processed bytes	*/
+ 	uint64_t	vs_configured_ashift;	/* TLV vdev_ashift      */
+ 	uint64_t	vs_logical_ashift;	/* vdev_logical_ashift  */
+ 	uint64_t	vs_physical_ashift;	/* vdev_physical_ashift */
+	uint64_t	vs_fragmentation;	/* device fragmentation */
 } vdev_stat_t;
+#define VDEV_STAT_VALID(field, uint64_t_field_count) \
+    ((uint64_t_field_count * sizeof(uint64_t)) >= \
+     (offsetof(vdev_stat_t, field) + sizeof(((vdev_stat_t *)NULL)->field)))
 
 /*
  * DDT statistics.  Note: all fields should be 64-bit because this
@@ -745,6 +797,10 @@
 #define	ZFS_DRIVER	"zfs"
 #define	ZFS_DEV_NAME	"zfs"
 #define	ZFS_DEV		"/dev/" ZFS_DEV_NAME
+#define	ZFS_DISK_ROOT	"/dev/dsk"
+#define	ZFS_DISK_ROOTD	ZFS_DISK_ROOT "/"
+#define	ZFS_RDISK_ROOT	"/dev/rdsk"
+#define	ZFS_RDISK_ROOTD	ZFS_RDISK_ROOT "/"
 
 /* general zvol path */
 #define	ZVOL_DIR		"/dev/zvol"
@@ -831,6 +887,10 @@
 	ZFS_IOC_SEND_NEW,
 	ZFS_IOC_SEND_SPACE,
 	ZFS_IOC_CLONE,
+	ZFS_IOC_BOOKMARK,
+	ZFS_IOC_GET_BOOKMARKS,
+	ZFS_IOC_DESTROY_BOOKMARKS,
+	ZFS_IOC_NEXTBOOT,
 	ZFS_IOC_LAST
 } zfs_ioc_t;
 
@@ -843,7 +903,8 @@
 	SPA_LOAD_IMPORT,	/* import in progress	*/
 	SPA_LOAD_TRYIMPORT,	/* tryimport in progress */
 	SPA_LOAD_RECOVER,	/* recovery requested	*/
-	SPA_LOAD_ERROR		/* load failed		*/
+	SPA_LOAD_ERROR,		/* load failed		*/
+	SPA_LOAD_CREATE		/* creation in progress */
 } spa_load_state_t;
 
 /*
@@ -892,6 +953,7 @@
 #define	ZFS_IMPORT_ANY_HOST	0x2
 #define	ZFS_IMPORT_MISSING_LOG	0x4
 #define	ZFS_IMPORT_ONLY		0x8
+#define	ZFS_IMPORT_TEMP_NAME	0x20
 
 /*
  * Sysevent payload members.  ZFS will generate the following sysevents with the
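
VDEV_STAT_VALID() exists so userland can consume vdev stats produced by an
older kernel: the stats travel as a bare uint64_t array in an nvlist, and the
macro checks that the array is long enough to contain the named field before
it is read. A hedged sketch, where nv is a hypothetical vdev config nvlist:

    vdev_stat_t *vs;
    uint_t c;
    uint64_t frag;

    (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
        (uint64_t **)&vs, &c);
    frag = VDEV_STAT_VALID(vs_fragmentation, c) ?
        vs->vs_fragmentation : ZFS_FRAG_INVALID;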

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -235,7 +236,7 @@
 /*
  * Define the appropriate "processor characteristics"
  */
-#if defined(sun)
+#ifdef illumos
 #define	_LITTLE_ENDIAN
 #endif
 #define	_STACK_GROWS_DOWNWARD
@@ -302,7 +303,7 @@
 /*
  * Define the appropriate "processor characteristics"
  */
-#if defined(sun)
+#ifdef illumos
 #define	_LITTLE_ENDIAN
 #endif
 #define	_STACK_GROWS_DOWNWARD
@@ -504,7 +505,7 @@
  * Define the appropriate "processor characteristics" shared between
  * all Solaris on SPARC systems.
  */
-#if defined(sun)
+#ifdef illumos
 #define	_BIG_ENDIAN
 #endif
 #define	_STACK_GROWS_DOWNWARD

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/note.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/note.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/note.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -25,6 +26,8 @@
  */
 
 /*
+ * Copyright 2014 Garrett D'Amore <garrett at damore.org>
+ *
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
@@ -112,7 +115,6 @@
  * User-level system call interface prototypes
  */
 #ifndef _KERNEL
-#ifdef __STDC__
 
 extern int	p_online(processorid_t processorid, int flag);
 extern int	processor_info(processorid_t processorid,
@@ -122,16 +124,6 @@
 extern processorid_t getcpuid(void);
 extern lgrpid_t gethomelgroup(void);
 
-#else
-
-extern int	p_online();
-extern int	processor_info();
-extern int	processor_bind();
-extern processorid_t getcpuid();
-extern lgrpid_t gethomelgroup();
-
-#endif /* __STDC__ */
-
 #else   /* _KERNEL */
 
 /*

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/procset.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/procset.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/procset.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -143,7 +144,7 @@
 
 #endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */
 
-#if defined(sun)
+#ifdef illumos
 #ifdef _KERNEL
 
 struct proc;

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/synch.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/synch.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/synch.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -20,7 +21,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  */
 
 #ifndef	_SYS_SYSEVENT_EVENTDEFS_H
@@ -249,9 +250,16 @@
 #define	ESC_ZFS_RESILVER_START		"ESC_ZFS_resilver_start"
 #define	ESC_ZFS_RESILVER_FINISH		"ESC_ZFS_resilver_finish"
 #define	ESC_ZFS_VDEV_REMOVE		"ESC_ZFS_vdev_remove"
+#define	ESC_ZFS_VDEV_REMOVE_AUX		"ESC_ZFS_vdev_remove_aux"
+#define	ESC_ZFS_VDEV_REMOVE_DEV		"ESC_ZFS_vdev_remove_dev"
+#define	ESC_ZFS_POOL_CREATE		"ESC_ZFS_pool_create"
 #define	ESC_ZFS_POOL_DESTROY		"ESC_ZFS_pool_destroy"
+#define	ESC_ZFS_POOL_IMPORT		"ESC_ZFS_pool_import"
+#define	ESC_ZFS_VDEV_ADD		"ESC_ZFS_vdev_add"
+#define	ESC_ZFS_VDEV_ATTACH		"ESC_ZFS_vdev_attach"
 #define	ESC_ZFS_VDEV_CLEAR		"ESC_ZFS_vdev_clear"
 #define	ESC_ZFS_VDEV_CHECK		"ESC_ZFS_vdev_check"
+#define	ESC_ZFS_VDEV_ONLINE		"ESC_ZFS_vdev_online"
 #define	ESC_ZFS_CONFIG_SYNC		"ESC_ZFS_config_sync"
 #define	ESC_ZFS_SCRUB_START		"ESC_ZFS_scrub_start"
 #define	ESC_ZFS_SCRUB_FINISH		"ESC_ZFS_scrub_finish"

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -214,7 +215,7 @@
 #define	EVCH_SET_CHAN_LEN	 3	/* Set event queue length */
 #define	EVCH_CMD_LAST		 EVCH_SET_CHAN_LEN	/* Last command */
 
-#ifdef sun
+#ifdef illumos
 /*
  * Shared user/kernel event channel interface definitions
  */
@@ -228,11 +229,11 @@
 extern int sysevent_evc_control(evchan_t *, int, ...);
 extern int sysevent_evc_setpropnvl(evchan_t *, nvlist_t *);
 extern int sysevent_evc_getpropnvl(evchan_t *, nvlist_t **);
-#endif	/* sun */
+#endif	/* illumos */
 
 #ifndef	_KERNEL
 
-#ifdef sun
+#ifdef illumos
 /*
  * Userland-only event channel interfaces
  */
@@ -254,7 +255,7 @@
 
 extern int sysevent_evc_xsubscribe(evchan_t *, const char *, const char *,
     int (*)(sysevent_t *, void *), void *, uint32_t, sysevent_subattr_t *);
-#endif	/* sun */
+#endif	/* illumos */
 
 #else
 
@@ -270,7 +271,7 @@
 extern void sysevent_free_attr(sysevent_attr_list_t *);
 extern int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *);
 extern void sysevent_detach_attributes(sysevent_t *);
-#ifdef sun
+#ifdef illumos
 extern char *sysevent_get_class_name(sysevent_t *);
 extern char *sysevent_get_subclass_name(sysevent_t *);
 extern uint64_t sysevent_get_seq(sysevent_t *);
@@ -278,7 +279,7 @@
 extern size_t sysevent_get_size(sysevent_t *);
 extern char *sysevent_get_pub(sysevent_t *);
 extern int sysevent_get_attr_list(sysevent_t *, nvlist_t **);
-#endif	/* sun */
+#endif	/* illumos */
 
 #endif	/* _KERNEL */
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -32,6 +33,9 @@
 
 #include <sys/param.h>
 #include <sys/isa_defs.h>
+#if defined(__FreeBSD__) && defined(_KERNEL)
+#include <sys/libkern.h>
+#endif
 
 #ifdef	__cplusplus
 extern "C" {
@@ -112,7 +116,7 @@
 #define	L_MAXMIN	L_MAXMIN32
 #endif
 
-#ifdef sun
+#ifdef illumos
 #ifdef _KERNEL
 
 /* major part of a device internal to the kernel */
@@ -172,7 +176,7 @@
 #define	getemajor(x)	(major_t)((((dev_t)(x) >> L_BITSMINOR) > L_MAXMAJ) ? \
 			    NODEV : (((dev_t)(x) >> L_BITSMINOR) & L_MAXMAJ))
 #define	geteminor(x)	(minor_t)((x) & L_MAXMIN)
-#endif /* sun */
+#endif /* illumos */
 
 /*
  * These are versions of the kernel routines for compressing and
@@ -382,7 +386,10 @@
 static __inline int
 highbit(ulong_t i)
 {
-	register int h = 1;
+#if defined(__FreeBSD__) && defined(_KERNEL) && defined(HAVE_INLINE_FLSL)
+	return (flsl(i));
+#else
+	int h = 1;
 
 	if (i == 0)
 		return (0);
@@ -407,8 +414,45 @@
 		h += 1;
 	}
 	return (h);
+#endif
 }
 
+/*
+ * Find highest one bit set.
+ *	Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ */
+static __inline int
+highbit64(uint64_t i)
+{
+#if defined(__FreeBSD__) && defined(_KERNEL) && defined(HAVE_INLINE_FLSLL)
+	return (flsll(i));
+#else
+	int h = 1;
+
+	if (i == 0)
+		return (0);
+	if (i & 0xffffffff00000000ULL) {
+		h += 32; i >>= 32;
+	}
+	if (i & 0xffff0000) {
+		h += 16; i >>= 16;
+	}
+	if (i & 0xff00) {
+		h += 8; i >>= 8;
+	}
+	if (i & 0xf0) {
+		h += 4; i >>= 4;
+	}
+	if (i & 0xc) {
+		h += 2; i >>= 2;
+	}
+	if (i & 0x2) {
+		h += 1;
+	}
+	return (h);
+#endif
+}
+
 #ifdef	__cplusplus
 }
 #endif
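
highbit64() mirrors highbit() for 64-bit operands: it returns the 1-based
index of the highest set bit (i.e. floor(log2(i)) + 1), or 0 when no bit is
set, and on FreeBSD kernels with an inline flsll() it reduces to that builtin.
For example:

    highbit64(0);			/* == 0, no bits set */
    highbit64(1);			/* == 1, bit 0 */
    highbit64(0x10);			/* == 5, floor(log2(16)) + 1 */
    highbit64(0x8000000000000000ULL);	/* == 64, top bit */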

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -45,6 +46,12 @@
 typedef uintptr_t taskqid_t;
 typedef void (task_func_t)(void *);
 
+typedef struct taskq_ent {
+	struct task	 tqent_task;
+	task_func_t	*tqent_func;
+	void		*tqent_arg;
+} taskq_ent_t;
+
 struct proc;
 
 /*
@@ -70,24 +77,25 @@
 
 extern taskq_t *system_taskq;
 
-extern void	taskq_init(void);
-extern void	taskq_mp_init(void);
+void	taskq_init(void);
+void	taskq_mp_init(void);
 
-extern taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
-extern taskq_t	*taskq_create_instance(const char *, int, int, pri_t, int,
-    int, uint_t);
-extern taskq_t	*taskq_create_proc(const char *, int, pri_t, int, int,
+taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
+taskq_t	*taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t);
+taskq_t	*taskq_create_proc(const char *, int, pri_t, int, int,
     struct proc *, uint_t);
-extern taskq_t	*taskq_create_sysdc(const char *, int, int, int,
+taskq_t	*taskq_create_sysdc(const char *, int, int, int,
     struct proc *, uint_t, uint_t);
-extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
-extern void	nulltask(void *);
-extern void	taskq_destroy(taskq_t *);
-extern void	taskq_wait(taskq_t *);
-extern void	taskq_suspend(taskq_t *);
-extern int	taskq_suspended(taskq_t *);
-extern void	taskq_resume(taskq_t *);
-extern int	taskq_member(taskq_t *, kthread_t *);
+taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
+void	taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
+    taskq_ent_t *);
+void	nulltask(void *);
+void	taskq_destroy(taskq_t *);
+void	taskq_wait(taskq_t *);
+void	taskq_suspend(taskq_t *);
+int	taskq_suspended(taskq_t *);
+void	taskq_resume(taskq_t *);
+int	taskq_member(taskq_t *, kthread_t *);
 
 #endif	/* _KERNEL */
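
taskq_dispatch_ent() takes a caller-supplied taskq_ent_t instead of allocating
one internally, so dispatch from a memory-constrained context cannot fail for
want of an allocation. A hedged sketch of the intended embedding pattern
(my_job_t and its fields are hypothetical):

    typedef struct my_job {
            taskq_ent_t	mj_ent;		/* storage for the dispatcher */
            void		*mj_arg;	/* payload for the task function */
    } my_job_t;

    static void
    my_job_func(void *arg)
    {
            my_job_t *job = arg;
            /* ... perform the work described by job->mj_arg ... */
    }

    static void
    my_job_submit(taskq_t *tq, my_job_t *job)
    {
            taskq_dispatch_ent(tq, my_job_func, job, 0, &job->mj_ent);
    }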
 

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -36,7 +37,7 @@
 extern "C" {
 #endif
 
-#ifdef sun
+#ifdef illumos
 /*
  * Unicode encoding conversion functions and their macros.
  */
@@ -58,7 +59,7 @@
 extern int uconv_u32tou8(const uint32_t *, size_t *, uchar_t *, size_t *, int);
 extern int uconv_u8tou16(const uchar_t *, size_t *, uint16_t *, size_t *, int);
 extern int uconv_u8tou32(const uchar_t *, size_t *, uint32_t *, size_t *, int);
-#endif	/* sun */
+#endif	/* illumos */
 
 /*
  * UTF-8 text preparation functions and their macros.

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/zmod.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/zmod.h	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/zmod.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/opensolaris_crc32.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/opensolaris_crc32.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/opensolaris_crc32.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.

Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,4 +1,4 @@
-/* $MidnightBSD: src/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c,v 1.2 2008/12/03 00:24:34 laffer1 Exp $ */
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *

Modified: trunk/sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -28,7 +29,7 @@
  * Use is subject to license terms.
  */
 
-#if defined(sun)
+#ifdef illumos
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 #endif
 
@@ -37,7 +38,7 @@
 #include <sys/dtrace.h>
 #include <sys/dtrace_impl.h>
 #include <sys/cmn_err.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/regset.h>
 #include <sys/privregs.h>
 #include <sys/segments.h>
@@ -46,14 +47,15 @@
 #include <cddl/dev/dtrace/dtrace_cddl.h>
 #include <sys/types.h>
 #include <sys/proc.h>
+#include <sys/rmlock.h>
 #include <sys/dtrace_bsd.h>
-#include <cddl/dev/dtrace/i386/regset.h>
+#include <cddl/dev/dtrace/x86/regset.h>
 #include <machine/segments.h>
 #include <machine/reg.h>
 #include <machine/pcb.h>
 #endif
 #include <sys/sysmacros.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/trap.h>
 #include <sys/archsystm.h>
 #else
@@ -75,7 +77,7 @@
 	uio.uio_td = curthread;
 	uio.uio_rw = op;
 	PHOLD(p);
-	if (proc_rwmem(p, &uio) < 0) {
+	if (proc_rwmem(p, &uio) != 0) {
 		PRELE(p);
 		return (-1);
 	}
@@ -97,7 +99,7 @@
 
 	return (proc_ops(UIO_WRITE, p, kaddr, uaddr, len));
 }
-#endif /* sun */
+#endif /* illumos */
 #ifdef __i386__
 #define	r_rax	r_eax
 #define	r_rbx	r_ebx
@@ -104,6 +106,7 @@
 #define	r_rip	r_eip
 #define	r_rflags r_eflags
 #define	r_rsp	r_esp
+#define	r_rbp	r_ebp
 #endif
 
 /*
@@ -272,7 +275,20 @@
 		 * registers.
 		 */
 		if (argno < 6)
-			return ((&rp->r_rdi)[argno]);
+			switch (argno) {
+			case 0:
+				return (rp->r_rdi);
+			case 1:
+				return (rp->r_rsi);
+			case 2:
+				return (rp->r_rdx);
+			case 3:
+				return (rp->r_rcx);
+			case 4:
+				return (rp->r_r8);
+			case 5:
+				return (rp->r_r9);
+			}
 
 		stack = (uintptr_t *)rp->r_rsp;
 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
@@ -733,13 +749,15 @@
 	fasttrap_tracepoint_t *tp;
 	fasttrap_bucket_t *bucket;
 	fasttrap_id_t *id;
-#if defined(sun)
+#ifdef illumos
 	kmutex_t *pid_mtx;
-#endif
 
-#if defined(sun)
 	pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock;
 	mutex_enter(pid_mtx);
+#else
+	struct rm_priotracker tracker;
+
+	rm_rlock(&fasttrap_tp_lock, &tracker);
 #endif
 	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
 
@@ -755,8 +773,10 @@
 	 * is not essential to the correct execution of the process.
 	 */
 	if (tp == NULL) {
-#if defined(sun)
+#ifdef illumos
 		mutex_exit(pid_mtx);
+#else
+		rm_runlock(&fasttrap_tp_lock, &tracker);
 #endif
 		return;
 	}
@@ -778,8 +798,10 @@
 		    rp->r_rax, rp->r_rbx, 0, 0);
 	}
 
-#if defined(sun)
+#ifdef illumos
 	mutex_exit(pid_mtx);
+#else
+	rm_runlock(&fasttrap_tp_lock, &tracker);
 #endif
 }
 
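For context on the locking change in this file: the Solaris per-CPU cpuc_pid_lock is replaced with FreeBSD's rmlock(9), whose read path is cheap but requires a caller-supplied tracker; that is why the probe paths above gain struct rm_priotracker locals. The general pattern, sketched with a hypothetical lock:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rmlock.h>

static struct rmlock example_lock;	/* hypothetical */

static void
example_setup(void)
{
	rm_init(&example_lock, "example");
}

static void
example_read_path(void)
{
	struct rm_priotracker tracker;	/* lives on the reader's stack */

	rm_rlock(&example_lock, &tracker);
	/* ... read shared state; writers are excluded here ... */
	rm_runlock(&example_lock, &tracker);
}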
@@ -786,7 +808,7 @@
 static void
 fasttrap_sigsegv(proc_t *p, kthread_t *t, uintptr_t addr)
 {
-#if defined(sun)
+#ifdef illumos
 	sigqueue_t *sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
 
 	sqp->sq_info.si_signo = SIGSEGV;
@@ -987,10 +1009,14 @@
 fasttrap_pid_probe(struct reg *rp)
 {
 	proc_t *p = curproc;
+#ifndef illumos
+	struct rm_priotracker tracker;
+	proc_t *pp;
+#endif
 	uintptr_t pc = rp->r_rip - 1;
 	uintptr_t new_pc = 0;
 	fasttrap_bucket_t *bucket;
-#if defined(sun)
+#ifdef illumos
 	kmutex_t *pid_mtx;
 #endif
 	fasttrap_tracepoint_t *tp, tp_local;
@@ -1022,24 +1048,31 @@
 	curthread->t_dtrace_regv = 0;
 #endif
 
-#if defined(sun)
 	/*
 	 * Treat a child created by a call to vfork(2) as if it were its
 	 * parent. We know that there's only one thread of control in such a
 	 * process: this one.
 	 */
+#ifdef illumos
 	while (p->p_flag & SVFORK) {
 		p = p->p_parent;
 	}
-#endif
 
-	PROC_LOCK(p);
-	_PHOLD(p);
 	pid = p->p_pid;
-#if defined(sun)
 	pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock;
 	mutex_enter(pid_mtx);
+#else
+	pp = p;
+	sx_slock(&proctree_lock);
+	while (pp->p_vmspace == pp->p_pptr->p_vmspace)
+		pp = pp->p_pptr;
+	pid = pp->p_pid;
+	sx_sunlock(&proctree_lock);
+	pp = NULL;
+
+	rm_rlock(&fasttrap_tp_lock, &tracker);
 #endif
+
 	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
 
 	/*
@@ -1057,11 +1090,11 @@
 	 * fasttrap_ioctl), or somehow we have mislaid this tracepoint.
 	 */
 	if (tp == NULL) {
-#if defined(sun)
+#ifdef illumos
 		mutex_exit(pid_mtx);
+#else
+		rm_runlock(&fasttrap_tp_lock, &tracker);
 #endif
-		_PRELE(p);
-		PROC_UNLOCK(p);
 		return (-1);
 	}
 
@@ -1183,9 +1216,10 @@
 	 * tracepoint again later if we need to light up any return probes.
 	 */
 	tp_local = *tp;
-	PROC_UNLOCK(p);
-#if defined(sun)
+#ifdef illumos
 	mutex_exit(pid_mtx);
+#else
+	rm_runlock(&fasttrap_tp_lock, &tracker);
 #endif
 	tp = &tp_local;
 
@@ -1381,17 +1415,16 @@
 	case FASTTRAP_T_PUSHL_EBP:
 	{
 		int ret = 0;
-		uintptr_t addr = 0;
 
 #ifdef __amd64
 		if (p->p_model == DATAMODEL_NATIVE) {
-			addr = rp->r_rsp - sizeof (uintptr_t);
-			ret = fasttrap_sulword((void *)addr, &rp->r_rsp);
+			rp->r_rsp -= sizeof (uintptr_t);
+			ret = fasttrap_sulword((void *)rp->r_rsp, rp->r_rbp);
 		} else {
 #endif
 #ifdef __i386__
-			addr = rp->r_rsp - sizeof (uint32_t);
-			ret = fasttrap_suword32((void *)addr, &rp->r_rsp);
+			rp->r_rsp -= sizeof (uint32_t);
+			ret = fasttrap_suword32((void *)rp->r_rsp, rp->r_rbp);
 #endif
 #ifdef __amd64
 		}
@@ -1398,12 +1431,11 @@
 #endif
 
 		if (ret == -1) {
-			fasttrap_sigsegv(p, curthread, addr);
+			fasttrap_sigsegv(p, curthread, rp->r_rsp);
 			new_pc = pc;
 			break;
 		}
 
-		rp->r_rsp = addr;
 		new_pc = pc + tp->ftt_size;
 		break;
 	}
@@ -1417,10 +1449,7 @@
 		if (tp->ftt_code == 0) {
 			new_pc = tp->ftt_dest;
 		} else {
-#ifdef __amd64
-			uintptr_t value;
-#endif
-			uintptr_t addr = tp->ftt_dest;
+			uintptr_t value, addr = tp->ftt_dest;
 
 			if (tp->ftt_base != FASTTRAP_NOREG)
 				addr += fasttrap_getreg(rp, tp->ftt_base);
@@ -1444,6 +1473,7 @@
 
 #ifdef __amd64
 				if (p->p_model == DATAMODEL_NATIVE) {
+#endif
 					if ((value = fasttrap_fulword((void *)addr))
 					     == -1) {
 						fasttrap_sigsegv(p, curthread,
@@ -1452,9 +1482,8 @@
 						break;
 					}
 					new_pc = value;
+#ifdef __amd64
 				} else {
-#endif
-#ifdef __i386__
 					uint32_t value32;
 					addr = (uintptr_t)(uint32_t)addr;
 					if ((value32 = fasttrap_fuword32((void *)addr))
@@ -1465,13 +1494,11 @@
 						break;
 					}
 					new_pc = value32;
+				}
 #endif
-				}
-#ifdef __amd64
 			} else {
 				new_pc = addr;
 			}
-#endif
 		}
 
 		/*
@@ -1487,14 +1514,12 @@
 			if (p->p_model == DATAMODEL_NATIVE) {
 				addr = rp->r_rsp - sizeof (uintptr_t);
 				pcps = pc + tp->ftt_size;
-				ret = fasttrap_sulword((void *)addr, &pcps);
+				ret = fasttrap_sulword((void *)addr, pcps);
 			} else {
 #endif
-#ifdef __i386__
 				addr = rp->r_rsp - sizeof (uint32_t);
 				pcps = (uint32_t)(pc + tp->ftt_size);
-				ret = fasttrap_suword32((void *)addr, &pcps);
-#endif
+				ret = fasttrap_suword32((void *)addr, pcps);
 #ifdef __amd64
 			}
 #endif
@@ -1519,9 +1544,8 @@
 		uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 7];
 #endif
 		uint_t i = 0;
-#if defined(sun)
+#ifdef illumos
 		klwp_t *lwp = ttolwp(curthread);
-#endif
 
 		/*
 		 * Compute the address of the ulwp_t and step over the
@@ -1529,7 +1553,6 @@
 		 * thread pointer is very different on 32- and 64-bit
 		 * kernels.
 		 */
-#if defined(sun)
 #if defined(__amd64)
 		if (p->p_model == DATAMODEL_LP64) {
 			addr = lwp->lwp_pcb.pcb_fsbase;
@@ -1542,13 +1565,23 @@
 		addr = USD_GETBASE(&lwp->lwp_pcb.pcb_gsdesc);
 		addr += sizeof (void *);
 #endif
-#endif /* sun */
-#ifdef __i386__
-		addr = USD_GETBASE(&curthread->td_pcb->pcb_gsd);
-#else
-		addr = curthread->td_pcb->pcb_gsbase;
-#endif
-		addr += sizeof (void *);
+#else	/* !illumos */
+		fasttrap_scrspace_t *scrspace;
+		scrspace = fasttrap_scraddr(curthread, tp->ftt_proc);
+		if (scrspace == NULL) {
+			/*
+			 * We failed to allocate scratch space for this thread.
+			 * Try to write the original instruction back out and
+			 * reset the pc.
+			 */
+			if (fasttrap_copyout(tp->ftt_instr, (void *)pc,
+			    tp->ftt_size))
+				fasttrap_sigtrap(p, curthread, pc);
+			new_pc = pc;
+			break;
+		}
+		addr = scrspace->ftss_addr;
+#endif /* illumos */
 
 		/*
 		 * Generic Instruction Tracing
@@ -1734,10 +1767,10 @@
 
 		ASSERT(i <= sizeof (scratch));
 
-#if defined(sun)
+#ifdef illumos
 		if (fasttrap_copyout(scratch, (char *)addr, i)) {
 #else
-		if (uwrite(curproc, scratch, i, addr)) {
+		if (uwrite(p, scratch, i, addr)) {
 #endif
 			fasttrap_sigtrap(p, curthread, pc);
 			new_pc = pc;
@@ -1796,10 +1829,11 @@
 
 	rp->r_rip = new_pc;
 
+#ifndef illumos
 	PROC_LOCK(p);
 	proc_write_regs(curthread, rp);
-	_PRELE(p);
 	PROC_UNLOCK(p);
+#endif
 
 	return (0);
 }
@@ -1816,7 +1850,7 @@
 	curthread->t_dtrace_scrpc = 0;
 	curthread->t_dtrace_astpc = 0;
 
-#if defined(sun)
+#ifdef illumos
 	/*
 	 * Treat a child created by a call to vfork(2) as if it were its
 	 * parent. We know that there's only one thread of control in such a
@@ -1889,7 +1923,7 @@
 	case REG_ERR:		return (rp->r_err);
 	case REG_RIP:		return (rp->r_rip);
 	case REG_CS:		return (rp->r_cs);
-#if defined(sun)
+#ifdef illumos
 	case REG_RFL:		return (rp->r_rfl);
 #endif
 	case REG_RSP:		return (rp->r_rsp);

Added: trunk/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c	                        (rev 0)
+++ trunk/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -0,0 +1,31 @@
+/* $MidnightBSD$ */
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+/*
+ *	XXX: Placeholder for MIPS fasttrap code
+ */


Property changes on: trunk/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h	                        (rev 0)
+++ trunk/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -0,0 +1,49 @@
+/* $MidnightBSD$ */
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_FASTTRAP_ISA_H
+#define	_FASTTRAP_ISA_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+/*
+ *  XXXDTRACE: placeholder for MIPS fasttrap stuff
+ */
+
+typedef	uint32_t	fasttrap_instr_t;
+#define	FASTTRAP_SUNWDTRACE_SIZE	64
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _FASTTRAP_ISA_H */


Property changes on: trunk/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c	                        (rev 0)
+++ trunk/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -0,0 +1,583 @@
+/* $MidnightBSD$ */
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Portions Copyright 2013 Justin Hibbits */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/fasttrap_isa.h>
+#include <sys/fasttrap_impl.h>
+#include <sys/dtrace.h>
+#include <sys/dtrace_impl.h>
+#include <cddl/dev/dtrace/dtrace_cddl.h>
+#include <sys/proc.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/ptrace.h>
+#include <sys/rmlock.h>
+#include <sys/sysent.h>
+
+#define OP(x)	((x) >> 26)
+#define OPX(x)	(((x) >> 2) & 0x3FF)
+#define OP_BO(x) (((x) & 0x03E00000) >> 21)
+#define OP_BI(x) (((x) & 0x001F0000) >> 16)
+#define OP_RS(x) (((x) & 0x03E00000) >> 21)
+#define OP_RA(x) (((x) & 0x001F0000) >> 16)
+#define OP_RB(x) (((x) & 0x0000F800) >> 11)	/* RB: bits 11-15 */
+
+
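As a worked example of these extractors (standalone, not part of the patch), 0x40820010 is the usual encoding of 'bne' with a +0x10 displacement:

#include <assert.h>
#include <stdint.h>

static void
decode_example(void)
{
	uint32_t instr = 0x40820010;	/* bne cr0,+0x10 */

	assert(OP(instr) == 16);	/* bc: branch conditional */
	assert(OP_BO(instr) == 4);	/* branch if the CR bit is 0 */
	assert(OP_BI(instr) == 2);	/* CR0[EQ] */
}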
+static int
+proc_ops(int op, proc_t *p, void *kaddr, off_t uaddr, size_t len)
+{
+	struct iovec iov;
+	struct uio uio;
+
+	iov.iov_base = kaddr;
+	iov.iov_len = len;
+	uio.uio_offset = uaddr;
+	uio.uio_iov = &iov;
+	uio.uio_resid = len;
+	uio.uio_iovcnt = 1;
+	uio.uio_segflg = UIO_SYSSPACE;
+	uio.uio_td = curthread;
+	uio.uio_rw = op;
+	PHOLD(p);
+	if (proc_rwmem(p, &uio) != 0) {
+		PRELE(p);
+		return (-1);
+	}
+	PRELE(p);
+
+	return (0);
+}
+
+static int
+uread(proc_t *p, void *kaddr, size_t len, uintptr_t uaddr)
+{
+
+	return (proc_ops(UIO_READ, p, kaddr, uaddr, len));
+}
+
+static int
+uwrite(proc_t *p, void *kaddr, size_t len, uintptr_t uaddr)
+{
+
+	return (proc_ops(UIO_WRITE, p, kaddr, uaddr, len));
+}
+
+int
+fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
+{
+	fasttrap_instr_t instr = FASTTRAP_INSTR;
+
+	if (uwrite(p, &instr, 4, tp->ftt_pc) != 0)
+		return (-1);
+
+	return (0);
+}
+
+int
+fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
+{
+	uint32_t instr;
+
+	/*
+	 * Distinguish between read or write failures and a changed
+	 * instruction.
+	 */
+	if (uread(p, &instr, 4, tp->ftt_pc) != 0)
+		return (0);
+	if (instr != FASTTRAP_INSTR)
+		return (0);
+	if (uwrite(p, &tp->ftt_instr, 4, tp->ftt_pc) != 0)
+		return (-1);
+
+	return (0);
+}
+
+int
+fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, uintptr_t pc,
+    fasttrap_probe_type_t type)
+{
+	uint32_t instr;
+
+	/*
+	 * Read the instruction at the given address out of the process's
+	 * address space. We don't have to worry about a debugger
+	 * changing this instruction before we overwrite it with our trap
+	 * instruction since P_PR_LOCK is set.
+	 */
+	if (uread(p, &instr, 4, pc) != 0)
+		return (-1);
+
+	/*
+	 * Decode the instruction to fill in the probe flags. We can have
+	 * the process execute most instructions on its own using a pc/npc
+	 * trick, but pc-relative control transfers present a problem since
+	 * we're relocating the instruction. We emulate these instructions
+	 * in the kernel. We assume a default type and overwrite that as
+	 * needed.
+	 *
+	 * pc-relative instructions must be emulated for correctness;
+	 * other instructions (which represent a large set of commonly traced
+	 * instructions) are emulated or otherwise optimized for performance.
+	 */
+	tp->ftt_type = FASTTRAP_T_COMMON;
+	tp->ftt_instr = instr;
+
+	switch (OP(instr)) {
+	/* The following are invalid for trapping (invalid opcodes, tw/twi). */
+	case 0:
+	case 1:
+	case 2:
+	case 4:
+	case 5:
+	case 6:
+	case 30:
+	case 39:
+	case 58:
+	case 62:
+	case 3:	/* twi */
+		return (-1);
+	case 31:	/* tw */
+		if (OPX(instr) == 4)
+			return (-1);
+		else if (OPX(instr) == 444 && OP_RS(instr) == OP_RA(instr) &&
+		    OP_RS(instr) == OP_RB(instr))
+			tp->ftt_type = FASTTRAP_T_NOP;
+		break;
+	case 16:
+		tp->ftt_type = FASTTRAP_T_BC;
+		tp->ftt_dest = instr & 0x0000FFFC; /* Extract target address */
+		if (instr & 0x00008000)
+			tp->ftt_dest |= 0xFFFF0000;
+		/* Use as offset if not absolute address. */
+		if (!(instr & 0x02))
+			tp->ftt_dest += pc;
+		tp->ftt_bo = OP_BO(instr);
+		tp->ftt_bi = OP_BI(instr);
+		break;
+	case 18:
+		tp->ftt_type = FASTTRAP_T_B;
+		tp->ftt_dest = instr & 0x03FFFFFC; /* Extract target address */
+		if (instr & 0x02000000)
+			tp->ftt_dest |= 0xFC000000;
+		/* Use as offset if not absolute address. */
+		if (!(instr & 0x02))
+			tp->ftt_dest += pc;
+		break;
+	case 19:
+		switch (OPX(instr)) {
+		case 528:	/* bcctr */
+			tp->ftt_type = FASTTRAP_T_BCTR;
+			tp->ftt_bo = OP_BO(instr);
+			tp->ftt_bi = OP_BI(instr);
+			break;
+		case 16:	/* bclr */
+			tp->ftt_type = FASTTRAP_T_BLR;
+			tp->ftt_bo = OP_BO(instr);
+			tp->ftt_bi = OP_BI(instr);
+			break;
+		}
+		break;
+	case 24:
+		if (OP_RS(instr) == OP_RA(instr) &&
+		    (instr & 0x0000FFFF) == 0)
+			tp->ftt_type = FASTTRAP_T_NOP;
+		break;
+	}
+
+	/*
+	 * We don't know how this tracepoint is going to be used, but in case
+	 * it's used as part of a function return probe, we need to indicate
+	 * whether it's always a return site or only potentially a return
+	 * site. If it's part of a return probe, it's always going to be a
+	 * return from that function if it's a restore instruction or if
+	 * the previous instruction was a return. If we could reliably
+	 * distinguish jump tables from return sites, this wouldn't be
+	 * necessary.
+	 */
+#if 0
+	if (tp->ftt_type != FASTTRAP_T_RESTORE &&
+	    (uread(p, &instr, 4, pc - sizeof (instr)) != 0 ||
+	    !(OP(instr) == 2 && OP3(instr) == OP3_RETURN)))
+		tp->ftt_flags |= FASTTRAP_F_RETMAYBE;
+#endif
+
+	return (0);
+}
+
+static uint64_t
+fasttrap_anarg(struct reg *rp, int argno)
+{
+	uint64_t value;
+	proc_t  *p = curproc;
+
+	/* The first 8 arguments are in registers. */
+	if (argno < 8)
+		return rp->fixreg[argno + 3];
+
+	/* Arguments on stack start after SP+LR (2 register slots). */
+	if (SV_PROC_FLAG(p, SV_ILP32)) {
+		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+		value = dtrace_fuword32((void *)(rp->fixreg[1] + 8 +
+		    ((argno - 8) * sizeof(uint32_t))));
+		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
+	} else {
+		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+		value = dtrace_fuword64((void *)(rp->fixreg[1] + 16 +
+		    ((argno - 8) * sizeof(uint64_t))));
+		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
+	}
+	return value;
+}
+
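The stack-slot arithmetic above can be read as a small address function; a sketch that mirrors it, assuming the 64-bit slot size used in the LP64 branch (stack_arg_addr is hypothetical):

/* Hypothetical helper mirroring the stack-slot math above. */
static uintptr_t
stack_arg_addr(uintptr_t sp, int argno, int ilp32)
{
	size_t slot = ilp32 ? sizeof (uint32_t) : sizeof (uint64_t);
	uintptr_t base = sp + (ilp32 ? 8 : 16);	/* skip the SP + LR slots */

	return (base + (argno - 8) * slot);
}

For ILP32, argument 9 then resolves to SP + 8 + 4 = SP + 12.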
+uint64_t
+fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
+    int aframes)
+{
+	struct reg r;
+
+	fill_regs(curthread, &r);
+
+	return (fasttrap_anarg(&r, argno));
+}
+
+uint64_t
+fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
+    int aframes)
+{
+	struct reg r;
+
+	fill_regs(curthread, &r);
+
+	return (fasttrap_anarg(&r, argno));
+}
+
+static void
+fasttrap_usdt_args(fasttrap_probe_t *probe, struct reg *rp, int argc,
+    uintptr_t *argv)
+{
+	int i, x, cap = MIN(argc, probe->ftp_nargs);
+
+	for (i = 0; i < cap; i++) {
+		x = probe->ftp_argmap[i];
+
+		if (x < 8)
+			argv[i] = rp->fixreg[x];
+		else if (SV_PROC_FLAG(curproc, SV_ILP32))
+			argv[i] = fuword32((void *)(rp->fixreg[1] + 8 +
+			    ((x - 8) * sizeof(uint32_t))));
+		else
+			/* 64-bit stack slots; fuword32 would truncate. */
+			argv[i] = fuword64((void *)(rp->fixreg[1] + 16 +
+			    ((x - 8) * sizeof(uint64_t))));
+	}
+
+	for (; i < argc; i++) {
+		argv[i] = 0;
+	}
+}
+
+static void
+fasttrap_return_common(struct reg *rp, uintptr_t pc, pid_t pid,
+    uintptr_t new_pc)
+{
+	struct rm_priotracker tracker;
+	fasttrap_tracepoint_t *tp;
+	fasttrap_bucket_t *bucket;
+	fasttrap_id_t *id;
+
+	rm_rlock(&fasttrap_tp_lock, &tracker);
+	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
+
+	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
+		if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
+		    tp->ftt_proc->ftpc_acount != 0)
+			break;
+	}
+
+	/*
+	 * Don't sweat it if we can't find the tracepoint again; unlike
+	 * when we're in fasttrap_pid_probe(), finding the tracepoint here
+	 * is not essential to the correct execution of the process.
+	 */
+	if (tp == NULL) {
+		rm_runlock(&fasttrap_tp_lock, &tracker);
+		return;
+	}
+
+	for (id = tp->ftt_retids; id != NULL; id = id->fti_next) {
+		/*
+		 * If there's a branch that could act as a return site, we
+		 * need to trace it, so check here whether the program
+		 * counter is external to the function: skip function-local
+		 * branches.
+		 */
+		if ((new_pc - id->fti_probe->ftp_faddr) <
+		    id->fti_probe->ftp_fsize)
+			continue;
+
+		dtrace_probe(id->fti_probe->ftp_id,
+		    pc - id->fti_probe->ftp_faddr,
+		    rp->fixreg[3], rp->fixreg[4], 0, 0);
+	}
+	rm_runlock(&fasttrap_tp_lock, &tracker);
+}
+
+
+static int
+fasttrap_branch_taken(int bo, int bi, struct reg *regs)
+{
+	int ctr_ok, cond_ok;
+
+	/* BO 1z1zz: branch always. */
+	if ((bo & 0x14) == 0x14)
+		return (1);
+
+	/* BO[2] clear: decrement CTR, then require the CTR condition. */
+	if (!(bo & 0x04)) {
+		--regs->ctr;
+		/* BO[3] set: branch if CTR == 0; clear: if CTR != 0. */
+		ctr_ok = ((regs->ctr == 0) == ((bo >> 1) & 1));
+	} else
+		ctr_ok = 1;
+
+	/* BO[0] set: ignore the CR bit; clear: CR[bi] must equal BO[1]. */
+	if (bo & 0x10)
+		cond_ok = 1;
+	else
+		cond_ok = (((regs->cr >> (31 - bi)) & 1) == ((bo >> 3) & 1));
+
+	return (ctr_ok && cond_ok);
+}
+
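A quick sanity check of the decrement case (standalone example; the struct reg field names follow FreeBSD's powerpc register set):

#include <assert.h>
#include <machine/reg.h>

static void
branch_taken_example(void)
{
	struct reg r = { .ctr = 4 };

	/* bdnz: BO = 16 (0b10000) decrements CTR, branches if CTR != 0. */
	assert(fasttrap_branch_taken(16, 0, &r) == 1);
	assert(r.ctr == 3);
}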
+
+int
+fasttrap_pid_probe(struct reg *rp)
+{
+	struct rm_priotracker tracker;
+	proc_t *p = curproc;
+	uintptr_t pc = rp->pc;
+	uintptr_t new_pc = 0;
+	fasttrap_bucket_t *bucket;
+	fasttrap_tracepoint_t *tp, tp_local;
+	pid_t pid;
+	dtrace_icookie_t cookie;
+	uint_t is_enabled = 0;
+
+	/*
+	 * It's possible that a user (in a veritable orgy of bad planning)
+	 * could redirect this thread's flow of control before it reached the
+	 * return probe fasttrap. In this case we need to kill the process
+	 * since it's in an unrecoverable state.
+	 */
+	if (curthread->t_dtrace_step) {
+		ASSERT(curthread->t_dtrace_on);
+		fasttrap_sigtrap(p, curthread, pc);
+		return (0);
+	}
+
+	/*
+	 * Clear all user tracing flags.
+	 */
+	curthread->t_dtrace_ft = 0;
+	curthread->t_dtrace_pc = 0;
+	curthread->t_dtrace_npc = 0;
+	curthread->t_dtrace_scrpc = 0;
+	curthread->t_dtrace_astpc = 0;
+
+	rm_rlock(&fasttrap_tp_lock, &tracker);
+	pid = p->p_pid;
+	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
+
+	/*
+	 * Lookup the tracepoint that the process just hit.
+	 */
+	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
+		if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
+		    tp->ftt_proc->ftpc_acount != 0)
+			break;
+	}
+
+	/*
+	 * If we couldn't find a matching tracepoint, either a tracepoint has
+	 * been inserted without using the pid<pid> ioctl interface (see
+	 * fasttrap_ioctl), or somehow we have mislaid this tracepoint.
+	 */
+	if (tp == NULL) {
+		rm_runlock(&fasttrap_tp_lock, &tracker);
+		return (-1);
+	}
+
+	if (tp->ftt_ids != NULL) {
+		fasttrap_id_t *id;
+
+		for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
+			fasttrap_probe_t *probe = id->fti_probe;
+
+			if (id->fti_ptype == DTFTP_ENTRY) {
+				/*
+				 * We note that this was an entry
+				 * probe to help ustack() find the
+				 * first caller.
+				 */
+				cookie = dtrace_interrupt_disable();
+				DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY);
+				dtrace_probe(probe->ftp_id, rp->fixreg[3],
+						rp->fixreg[4], rp->fixreg[5], rp->fixreg[6],
+						rp->fixreg[7]);
+				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY);
+				dtrace_interrupt_enable(cookie);
+			} else if (id->fti_ptype == DTFTP_IS_ENABLED) {
+				/*
+				 * Note that in this case, we don't
+				 * call dtrace_probe() since it's only
+				 * an artificial probe meant to change
+				 * the flow of control so that it
+				 * encounters the true probe.
+				 */
+				is_enabled = 1;
+			} else if (probe->ftp_argmap == NULL) {
+				dtrace_probe(probe->ftp_id, rp->fixreg[3],
+				    rp->fixreg[4], rp->fixreg[5], rp->fixreg[6],
+				    rp->fixreg[7]);
+			} else {
+				uintptr_t t[5];
+
+				fasttrap_usdt_args(probe, rp,
+				    sizeof (t) / sizeof (t[0]), t);
+
+				dtrace_probe(probe->ftp_id, t[0], t[1],
+				    t[2], t[3], t[4]);
+			}
+		}
+	}
+
+	/*
+	 * We're about to do a bunch of work so we cache a local copy of
+	 * the tracepoint to emulate the instruction, and then find the
+	 * tracepoint again later if we need to light up any return probes.
+	 */
+	tp_local = *tp;
+	rm_runlock(&fasttrap_tp_lock, &tracker);
+	tp = &tp_local;
+
+	/*
+	 * If there's an is-enabled probe connected to this tracepoint it
+	 * means that there was a 'xor r3, r3, r3' instruction placed there
+	 * by DTrace when the binary was linked. As this probe is, in fact,
+	 * enabled, we need to stuff 1
+	 * into R3. Accordingly, we can bypass all the instruction
+	 * emulation logic since we know the inevitable result. It's possible
+	 * that a user could construct a scenario where the 'is-enabled'
+	 * probe was on some other instruction, but that would be a rather
+	 * exotic way to shoot oneself in the foot.
+	 */
+	if (is_enabled) {
+		rp->fixreg[3] = 1;
+		new_pc = rp->pc + 4;
+		goto done;
+	}
+
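The sentinel mentioned in the comment above is easy to verify by hand (standalone check, not part of the patch):

#include <assert.h>
#include <stdint.h>

static void
is_enabled_sentinel(void)
{
	/* xor r3,r3,r3: OP=31, RS=RA=RB=3, XO=316, Rc=0. */
	uint32_t xor_r3 = (31u << 26) | (3u << 21) | (3u << 16) |
	    (3u << 11) | (316u << 1);

	assert(xor_r3 == 0x7C631A78);
}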
+
+	switch (tp->ftt_type) {
+	case FASTTRAP_T_NOP:
+		new_pc = rp->pc + 4;
+		break;
+	case FASTTRAP_T_BC:
+		if (!fasttrap_branch_taken(tp->ftt_bo, tp->ftt_bi, rp))
+			break;
+		/* FALLTHROUGH */
+	case FASTTRAP_T_B:
+		if (tp->ftt_instr & 0x01)
+			rp->lr = rp->pc + 4;
+		new_pc = tp->ftt_dest;
+		break;
+	case FASTTRAP_T_BLR:
+	case FASTTRAP_T_BCTR:
+		if (!fasttrap_branch_taken(tp->ftt_bo, tp->ftt_bi, rp))
+			break;
+		if (tp->ftt_type == FASTTRAP_T_BCTR)
+			new_pc = rp->ctr;
+		else
+			new_pc = rp->lr;
+		if (tp->ftt_instr & 0x01)
+			rp->lr = rp->pc + 4;
+		break;
+	case FASTTRAP_T_COMMON:
+		break;
+	}
+done:
+	/*
+	 * If there were no return probes when we first found the tracepoint,
+	 * we should feel no obligation to honor any return probes that were
+	 * subsequently enabled -- they'll just have to wait until the next
+	 * time around.
+	 */
+	if (tp->ftt_retids != NULL) {
+		/*
+		 * We need to wait until the results of the instruction are
+		 * apparent before invoking any return probes. If this
+		 * instruction was emulated we can just call
+		 * fasttrap_return_common(); if it needs to be executed, we
+		 * need to wait until the user thread returns to the kernel.
+		 */
+		if (tp->ftt_type != FASTTRAP_T_COMMON) {
+			fasttrap_return_common(rp, pc, pid, new_pc);
+		} else {
+			ASSERT(curthread->t_dtrace_ret != 0);
+			ASSERT(curthread->t_dtrace_pc == pc);
+			ASSERT(curthread->t_dtrace_scrpc != 0);
+			ASSERT(new_pc == curthread->t_dtrace_astpc);
+		}
+	}
+
+	rp->pc = new_pc;
+	set_regs(curthread, rp);
+
+	return (0);
+}
+
+int
+fasttrap_return_probe(struct reg *rp)
+{
+	proc_t *p = curproc;
+	uintptr_t pc = curthread->t_dtrace_pc;
+	uintptr_t npc = curthread->t_dtrace_npc;
+
+	curthread->t_dtrace_pc = 0;
+	curthread->t_dtrace_npc = 0;
+	curthread->t_dtrace_scrpc = 0;
+	curthread->t_dtrace_astpc = 0;
+
+	/*
+	 * We set rp->pc to the address of the traced instruction so
+	 * that it appears to dtrace_probe() that we're on the original
+	 * instruction, and so that the user can't easily detect our
+	 * complex web of lies. dtrace_return_probe() (our caller)
+	 * will correctly set %pc after we return.
+	 */
+	rp->pc = pc;
+
+	fasttrap_return_common(rp, pc, p->p_pid, npc);
+
+	return (0);
+}
+


Property changes on: trunk/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h	                        (rev 0)
+++ trunk/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h	2018-06-01 22:46:41 UTC (rev 10164)
@@ -0,0 +1,77 @@
+/* $MidnightBSD$ */
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Portions Copyright 2013 Justin Hibbits */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_FASTTRAP_ISA_H
+#define	_FASTTRAP_ISA_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	FASTTRAP_SUNWDTRACE_SIZE	64
+#define FASTTRAP_INSTR			0x0FFFDDDD
+
+typedef	uint32_t	fasttrap_instr_t;
+
+typedef struct fasttrap_machtp_t {
+	fasttrap_instr_t	ftmt_instr;	/* original instruction */
+	uintptr_t		ftmt_dest;	/* branch target */
+	uint8_t			ftmt_type;	/* emulation type */
+	uint8_t			ftmt_flags;	/* emulation flags */
+	uint8_t			ftmt_bo;	/* BO field */
+	uint8_t			ftmt_bi;	/* BI field (CR bit) */
+} fasttrap_machtp_t;
+
+#define	ftt_instr	ftt_mtp.ftmt_instr
+#define	ftt_dest	ftt_mtp.ftmt_dest
+#define	ftt_type	ftt_mtp.ftmt_type
+#define	ftt_flags	ftt_mtp.ftmt_flags
+#define	ftt_bo		ftt_mtp.ftmt_bo
+#define	ftt_bi		ftt_mtp.ftmt_bi
+
+#define FASTTRAP_T_COMMON	0x00
+#define FASTTRAP_T_B		0x01
+#define FASTTRAP_T_BC		0x02
+#define FASTTRAP_T_BLR		0x03
+#define FASTTRAP_T_BCTR		0x04
+#define FASTTRAP_T_NOP		0x05
+
+#define	FASTTRAP_AFRAMES		3
+#define	FASTTRAP_RETURN_AFRAMES		4
+#define	FASTTRAP_ENTRY_AFRAMES		3
+#define	FASTTRAP_OFFSET_AFRAMES		3
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _FASTTRAP_ISA_H */


Property changes on: trunk/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
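On the choice of FASTTRAP_INSTR above: 0x0FFFDDDD decodes as a twi (trap word immediate) with TO = 31, an unconditional trap, so a traced process always enters the kernel when it hits a tracepoint; fasttrap_tracepoint_init() correspondingly refuses to trace twi/tw instructions themselves. A standalone check (OP_TO here is a local helper mirroring the OP_BO bit range in the companion fasttrap_isa.c):

#include <assert.h>
#include <stdint.h>

#define OP(x)	 ((x) >> 26)
#define OP_TO(x) (((x) & 0x03E00000) >> 21)	/* same bits as OP_BO */

static void
fasttrap_instr_example(void)
{
	uint32_t instr = 0x0FFFDDDD;	/* FASTTRAP_INSTR */

	assert(OP(instr) == 3);		/* twi: trap word immediate */
	assert(OP_TO(instr) == 31);	/* TO = 0b11111: trap always */
}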
Modified: trunk/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c	2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c	2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * CDDL HEADER START
  *
@@ -24,8 +25,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/fasttrap_isa.h>
 #include <sys/fasttrap_impl.h>
 #include <sys/dtrace.h>
@@ -1410,7 +1409,7 @@
 		value = dtrace_getreg_win(reg, 1);
 		dtrace_interrupt_enable(cookie);
 
-		atomic_add_64(&fasttrap_getreg_fast_cnt, 1);
+		atomic_inc_64(&fasttrap_getreg_fast_cnt);
 
 		return (value);
 	}
@@ -1435,7 +1434,7 @@
 				if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
 					continue;
 
-				atomic_add_64(&fasttrap_getreg_mpcb_cnt, 1);
+				atomic_inc_64(&fasttrap_getreg_mpcb_cnt);
 				return (rwin[i].rw_local[reg - 16]);
 			} while (i > 0);
 		}
@@ -1455,7 +1454,7 @@
 				if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
 					continue;
 
-				atomic_add_64(&fasttrap_getreg_mpcb_cnt, 1);
+				atomic_inc_64(&fasttrap_getreg_mpcb_cnt);
 				return (rwin[i].rw_local[reg - 16]);
 			} while (i > 0);
 		}
@@ -1466,7 +1465,7 @@
 		v32[0] = 0;
 	}
 
-	atomic_add_64(&fasttrap_getreg_slow_cnt, 1);
+	atomic_inc_64(&fasttrap_getreg_slow_cnt);
 	return (value);
 
 err:
@@ -1505,7 +1504,7 @@
 	if (dtrace_getotherwin() > 0) {
 		dtrace_putreg_win(reg, value);
 		dtrace_interrupt_enable(cookie);
-		atomic_add_64(&fasttrap_putreg_fast_cnt, 1);
+		atomic_inc_64(&fasttrap_putreg_fast_cnt);
 		return;
 	}
 	dtrace_interrupt_enable(cookie);
@@ -1536,7 +1535,7 @@
 					continue;
 
 				rwin[i].rw_local[reg - 16] = value;
-				atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
+				atomic_inc_64(&fasttrap_putreg_mpcb_cnt);
 				return;
 			} while (i > 0);
 		}
@@ -1549,7 +1548,7 @@
 			rwin[mpcb->mpcb_wbcnt].rw_local[reg - 16] = value;
 			mpcb->mpcb_spbuf[mpcb->mpcb_wbcnt] = (caddr_t)rp->r_sp;
 			mpcb->mpcb_wbcnt++;
-			atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
+			atomic_inc_64(&fasttrap_putreg_mpcb_cnt);
 			return;
 		}
 	} else {
@@ -1567,7 +1566,7 @@
 					continue;
 
 				rwin[i].rw_local[reg - 16] = v32;
-				atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
+				atomic_inc_64(&fasttrap_putreg_mpcb_cnt);
 				return;
 			} while (i > 0);
 		}
@@ -1580,12 +1579,12 @@
 			rwin[mpcb->mpcb_wbcnt].rw_local[reg - 16] = v32;
 			mpcb->mpcb_spbuf[mpcb->mpcb_wbcnt] = (caddr_t)rp->r_sp;
 			mpcb->mpcb_wbcnt++;
-			atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
+			atomic_inc_64(&fasttrap_putreg_mpcb_cnt);
 			return;
 		}
 	}
 
-	atomic_add_64(&fasttrap_putreg_slow_cnt, 1);
+	atomic_inc_64(&fasttrap_putreg_slow_cnt);
 	return;
 
 err:


